[med-svn] [r-cran-dplyr] 01/05: New upstream version 0.7.4
Andreas Tille
tille at debian.org
Fri Sep 29 13:27:58 UTC 2017
This is an automated email from the git hooks/post-receive script.
tille pushed a commit to branch master
in repository r-cran-dplyr.
commit 7a424f9064b1acb0da3b3e8a76201d378cb7a39b
Author: Andreas Tille <tille at debian.org>
Date: Fri Sep 29 15:14:14 2017 +0200
New upstream version 0.7.4
---
DESCRIPTION | 55 +-
MD5 | 836 ++++-----
NAMESPACE | 345 ++--
NEWS.md | 455 ++++-
R/RcppExports.R | 211 ++-
R/all-equal.r | 14 +-
R/bench-compare.r | 69 +-
R/bind.r | 161 +-
R/case_when.R | 82 +-
R/coalesce.R | 42 +-
R/colwise-arrange.R | 43 +
R/colwise-filter.R | 76 +
R/colwise-group-by.R | 48 +
R/colwise-mutate.R | 268 +++
R/colwise-select.R | 97 ++
R/colwise.R | 392 ++---
R/compat-dbplyr.R | 72 +
R/compat-lazyeval.R | 90 +
R/compat-purrr.R | 161 ++
R/compute-collect.r | 51 +-
R/copy-to.r | 32 +-
R/count-tally.R | 170 ++
R/data-bands.R | 23 +
R/data-lahman.r | 110 --
R/data-nasa.r | 14 +-
R/data-nycflights13.r | 72 -
R/data-starwars.R | 21 +
R/data-storms.R | 25 +
R/data-temp.r | 72 -
R/data.r | 65 -
R/dataframe.R | 182 +-
R/dbi-s3.r | 288 ---
R/dbplyr.R | 184 ++
R/desc.r | 6 +-
R/distinct.R | 57 +-
R/do.r | 61 +-
R/dplyr.r | 35 +-
R/dr.R | 35 +
R/error.R | 133 ++
R/explain.r | 27 +-
R/failwith.r | 19 +-
R/funs-predicates.R | 48 +
R/funs.R | 190 +-
R/group-by.r | 164 +-
R/group-indices.R | 40 +-
R/group-size.r | 1 +
R/grouped-df.r | 229 ++-
R/hybrid.R | 61 +
R/id.r | 11 +-
R/if_else.R | 42 +-
R/inline.r | 7 +-
R/join.r | 164 +-
R/lazy-ops.R | 244 ---
R/lead-lag.R | 24 +-
R/location.R | 1 +
R/manip.r | 520 ++++--
R/na_if.R | 18 +-
R/near.R | 4 +-
R/nth-value.R | 49 +-
R/order-by.R | 53 +-
R/over.R | 59 -
R/partial-eval.r | 124 --
R/progress.R | 18 +-
R/pull.R | 52 +
R/query.r | 75 -
R/rank.R | 39 +-
R/rbind.R | 71 +
R/recode.R | 118 +-
R/reexport-rlang.R | 11 +
R/{tibble-reexport.r => reexport-tibble.r} | 12 +
R/rowwise.r | 50 +-
R/sample.R | 64 +-
R/select-utils.R | 68 +-
R/select-var.R | 65 +
R/select-vars.R | 214 ++-
R/sets.r | 3 +-
R/sql-build.R | 205 ---
R/sql-escape.r | 248 ---
R/sql-generic.R | 184 --
R/sql-query.R | 125 --
R/sql-render.R | 70 -
R/sql-star.r | 15 -
R/src-local.r | 25 +-
R/src-mysql.r | 225 ---
R/src-postgres.r | 197 ---
R/src-sql.r | 32 -
R/src-sqlite.r | 195 ---
R/src-test.r | 29 -
R/src.r | 11 +-
R/src_dbi.R | 173 ++
R/tally.R | 91 -
R/tbl-cube.r | 192 +-
R/tbl-df.r | 190 +-
R/tbl-lazy.R | 130 --
R/tbl-sql.r | 481 -----
R/tbl.r | 25 +-
R/top-n.R | 51 +-
R/translate-sql-base.r | 246 ---
R/translate-sql-helpers.r | 220 ---
R/translate-sql-window.r | 66 -
R/translate-sql.r | 233 ---
R/ts.R | 4 +
R/utils-expr.R | 50 +
R/utils-format.r | 12 +-
R/utils-replace-with.R | 43 +-
R/utils.r | 151 +-
R/view.r | 47 -
R/zzz.r | 14 +-
README.md | 284 +--
build/vignette.rds | Bin 444 -> 331 bytes
data/band_instruments.rda | Bin 0 -> 194 bytes
data/band_instruments2.rda | Bin 0 -> 210 bytes
data/band_members.rda | Bin 0 -> 194 bytes
data/starwars.rda | Bin 0 -> 3378 bytes
data/storms.rda | Bin 0 -> 40289 bytes
inst/doc/compatibility.R | 114 ++
inst/doc/compatibility.Rmd | 288 +++
inst/doc/compatibility.html | 243 +++
inst/doc/data_frames.R | 23 -
inst/doc/data_frames.Rmd | 79 -
inst/doc/databases.R | 98 --
inst/doc/databases.Rmd | 257 ---
inst/doc/{introduction.R => dplyr.R} | 80 +-
vignettes/introduction.Rmd => inst/doc/dplyr.Rmd | 333 ++--
inst/doc/dplyr.html | 664 +++++++
inst/doc/hybrid-evaluation.R | 12 -
inst/doc/introduction.html | 570 ------
inst/doc/new-sql-backend.R | 34 -
inst/doc/new-sql-backend.Rmd | 123 --
inst/doc/nse.R | 51 -
inst/doc/nse.Rmd | 102 --
inst/doc/programming.R | 314 ++++
inst/doc/programming.Rmd | 587 +++++++
inst/doc/programming.html | 613 +++++++
inst/doc/two-table.Rmd | 22 +-
inst/doc/two-table.html | 291 ++--
inst/doc/window-functions.R | 96 +-
inst/doc/window-functions.Rmd | 218 +--
inst/doc/window-functions.html | 246 +--
inst/include/dplyr.h | 202 +--
inst/include/dplyr/BoolResult.h | 82 +-
inst/include/dplyr/CharacterVectorOrderer.h | 22 +-
inst/include/dplyr/Collecter.h | 993 +++++++----
inst/include/dplyr/Column.h | 26 +
inst/include/dplyr/DataFrameAble.h | 150 --
inst/include/dplyr/DataFrameColumnSubsetVisitor.h | 68 +-
inst/include/dplyr/DataFrameColumnVisitor.h | 68 +-
inst/include/dplyr/DataFrameJoinVisitors.h | 115 +-
inst/include/dplyr/DataFrameSubsetVisitors.h | 220 +--
inst/include/dplyr/DataFrameVisitors.h | 57 +-
inst/include/dplyr/DataFrameVisitorsIndexMap.h | 5 +-
inst/include/dplyr/DataFrameVisitorsIndexSet.h | 10 -
inst/include/dplyr/EmptySubset.h | 7 +-
inst/include/dplyr/FullDataFrame.h | 28 +-
inst/include/dplyr/Gatherer.h | 577 +++---
inst/include/dplyr/GroupedDataFrame.h | 259 +--
inst/include/dplyr/Groups.h | 16 +
inst/include/dplyr/Hybrid.h | 12 +
inst/include/dplyr/HybridHandler.h | 11 +
inst/include/dplyr/HybridHandlerMap.h | 20 +
inst/include/dplyr/JoinVisitor.h | 25 +-
inst/include/dplyr/JoinVisitorImpl.h | 736 +++-----
.../dplyr/MatrixColumnSubsetVectorVisitor.h | 164 +-
inst/include/dplyr/MatrixColumnVisitor.h | 226 ++-
inst/include/dplyr/MultipleVectorVisitors.h | 99 +-
inst/include/dplyr/NamedListAccumulator.h | 91 +-
inst/include/dplyr/Order.h | 141 +-
inst/include/dplyr/OrderVisitor.h | 14 +-
inst/include/dplyr/OrderVisitorImpl.h | 490 +++---
inst/include/dplyr/Replicator.h | 97 +-
inst/include/dplyr/Result/CallElementProxy.h | 22 +-
inst/include/dplyr/Result/CallProxy.h | 66 +-
inst/include/dplyr/Result/CallbackProcessor.h | 194 ++-
inst/include/dplyr/Result/ConstantResult.h | 154 +-
inst/include/dplyr/Result/Count.h | 18 +-
inst/include/dplyr/Result/Count_Distinct.h | 112 +-
inst/include/dplyr/Result/CumMax.h | 68 +-
inst/include/dplyr/Result/CumMin.h | 68 +-
inst/include/dplyr/Result/CumSum.h | 84 +-
inst/include/dplyr/Result/DelayedProcessor.h | 525 +++---
inst/include/dplyr/Result/DelayedReducer.h | 28 -
inst/include/dplyr/Result/Everything.h | 7 -
inst/include/dplyr/Result/GroupedCallProxy.h | 290 +---
inst/include/dplyr/Result/GroupedCallReducer.h | 39 +-
inst/include/dplyr/Result/GroupedHybridCall.h | 339 ++--
inst/include/dplyr/Result/GroupedSubset.h | 254 ++-
inst/include/dplyr/Result/GroupedSubsetBase.h | 21 +
inst/include/dplyr/Result/ILazySubsets.h | 34 +
inst/include/dplyr/Result/In.h | 61 +-
inst/include/dplyr/Result/Lag.h | 164 +-
inst/include/dplyr/Result/LazyGroupedSubsets.h | 214 ++-
inst/include/dplyr/Result/LazyRowwiseSubsets.h | 88 +-
inst/include/dplyr/Result/LazySubsets.h | 124 +-
inst/include/dplyr/Result/Lead.h | 168 +-
inst/include/dplyr/Result/Mean.h | 202 +--
inst/include/dplyr/Result/MinMax.h | 69 +
inst/include/dplyr/Result/Mutater.h | 86 +-
inst/include/dplyr/Result/Processor.h | 203 +--
inst/include/dplyr/Result/Rank.h | 766 ++++----
inst/include/dplyr/Result/Result.h | 27 +-
inst/include/dplyr/Result/ResultSet.h | 28 -
inst/include/dplyr/Result/RowwiseSubset.h | 158 +-
inst/include/dplyr/Result/Sd.h | 32 +-
inst/include/dplyr/Result/Sum.h | 166 +-
inst/include/dplyr/Result/Var.h | 158 +-
inst/include/dplyr/Result/VectorSliceVisitor.h | 60 +-
inst/include/dplyr/Result/all.h | 14 +-
inst/include/dplyr/Result/factories.h | 43 -
inst/include/dplyr/Result/is_smaller.h | 16 +-
inst/include/dplyr/Result/max.h | 82 -
inst/include/dplyr/Result/min.h | 85 -
inst/include/dplyr/RowwiseDataFrame.h | 132 +-
inst/include/dplyr/SubsetVectorVisitor.h | 60 +-
inst/include/dplyr/SubsetVectorVisitorImpl.h | 402 +++--
inst/include/dplyr/SummarisedVariable.h | 18 +-
inst/include/dplyr/VectorVisitor.h | 47 +-
inst/include/dplyr/VectorVisitorImpl.h | 368 ++--
inst/include/dplyr/bad.h | 89 +
inst/include/dplyr/check_supported_type.h | 37 -
inst/include/dplyr/checks.h | 93 +
inst/include/dplyr/comparisons.h | 241 +--
inst/include/dplyr/comparisons_different.h | 92 -
inst/include/dplyr/dplyr.h | 36 +
inst/include/dplyr/get_column.h | 10 +
inst/include/dplyr/join_match.h | 80 +
inst/include/dplyr/main.h | 15 +
inst/include/dplyr/registration.h | 39 +-
inst/include/dplyr/subset_visitor.h | 62 +-
inst/include/dplyr/subset_visitor_impl.h | 88 +
inst/include/dplyr/tbl_cpp.h | 40 +-
inst/include/dplyr/train.h | 92 +-
inst/include/dplyr/vector_class.h | 52 +-
inst/include/dplyr/visitor.h | 59 +-
inst/include/dplyr/visitor_impl.h | 77 +
.../dplyr/visitor_set/VisitorEqualPredicate.h | 22 +-
inst/include/dplyr/visitor_set/VisitorHash.h | 22 +-
inst/include/dplyr/visitor_set/VisitorSetEqual.h | 36 +-
.../dplyr/visitor_set/VisitorSetEqualPredicate.h | 24 +-
inst/include/dplyr/visitor_set/VisitorSetGreater.h | 30 +-
.../dplyr/visitor_set/VisitorSetGreaterPredicate.h | 20 -
inst/include/dplyr/visitor_set/VisitorSetHash.h | 36 +-
inst/include/dplyr/visitor_set/VisitorSetHasher.h | 24 +-
.../include/dplyr/visitor_set/VisitorSetIndexMap.h | 45 +-
.../include/dplyr/visitor_set/VisitorSetIndexSet.h | 37 +-
inst/include/dplyr/visitor_set/VisitorSetLess.h | 30 +-
.../dplyr/visitor_set/VisitorSetLessPredicate.h | 20 -
inst/include/dplyr/visitor_set/VisitorSetMixin.h | 9 +
inst/include/dplyr/visitor_set/visitor_set.h | 13 -
inst/include/dplyr/white_list.h | 46 +-
inst/include/dplyr/workarounds.h | 4 +-
inst/include/dplyr/workarounds/static_assert.h | 4 +-
inst/include/dplyr/workarounds/xlen.h | 22 +
inst/include/dplyr_RcppExports.h | 87 +
inst/include/dplyr_types.h | 8 +
inst/include/solaris/solaris.h | 16 +-
inst/include/tools/Call.h | 88 +-
inst/include/tools/DotsOf.h | 105 --
inst/include/tools/Encoding.h | 74 -
inst/include/tools/FilteredListOf.h | 31 -
inst/include/tools/LazyDots.h | 81 -
inst/include/tools/Quosure.h | 125 ++
inst/include/tools/ShrinkableVector.h | 93 +-
inst/include/tools/SlicingIndex.h | 126 +-
inst/include/tools/StringUTF8.h | 58 -
inst/include/tools/SymbolMap.h | 234 ++-
inst/include/tools/SymbolString.h | 57 +
inst/include/tools/SymbolVector.h | 78 +
inst/include/tools/all_na.h | 12 +-
inst/include/tools/collapse.h | 38 +-
inst/include/tools/complex.h | 10 -
inst/include/tools/debug.h | 13 +
inst/include/tools/delete_all.h | 16 -
inst/include/tools/encoding.h | 65 +
inst/include/tools/get_all_second.h | 18 -
inst/include/tools/get_single_class.h | 35 -
inst/include/tools/hash.h | 30 +-
inst/include/tools/match.h | 27 +-
inst/include/tools/pointer_vector.h | 67 +-
inst/include/tools/rlang-export.h | 21 +
inst/include/tools/scalar_type.h | 22 +
inst/include/tools/tools.h | 19 +-
inst/include/tools/utils.h | 49 +
inst/include/tools/wrap_subset.h | 31 +-
man/add_rownames.Rd | 4 +-
man/all_equal.Rd | 15 +-
man/all_vars.Rd | 32 +
man/arrange.Rd | 40 +-
man/arrange_all.Rd | 56 +
man/as.table.tbl_cube.Rd | 7 +-
man/as.tbl_cube.Rd | 9 +-
man/auto_copy.Rd | 7 +-
man/backend_db.Rd | 106 --
man/backend_dbplyr.Rd | 148 ++
man/backend_sql.Rd | 41 -
man/backend_src.Rd | 20 -
man/band_members.Rd | 33 +
man/bench_compare.Rd | 33 +-
man/between.Rd | 1 -
man/bind.Rd | 88 +-
man/build_sql.Rd | 37 -
man/case_when.Rd | 45 +-
man/check_dbplyr.Rd | 26 +
man/coalesce.Rd | 24 +-
man/common_by.Rd | 1 -
man/compute.Rd | 75 +-
man/copy_to.Rd | 19 +-
man/copy_to.src_sql.Rd | 60 -
man/cumall.Rd | 7 +-
man/desc.Rd | 7 +-
man/dim_desc.Rd | 1 -
man/distinct.Rd | 22 +-
man/do.Rd | 44 +-
man/dplyr-package.Rd | 72 +
man/dplyr.Rd | 28 -
man/dr_dplyr.Rd | 18 +
man/explain.Rd | 15 +-
man/failwith.Rd | 19 +-
man/figures/logo.png | Bin 0 -> 15133 bytes
man/filter.Rd | 67 +-
man/filter_all.Rd | 59 +
man/funs.Rd | 35 +-
man/group_by.Rd | 102 +-
man/group_by_all.Rd | 62 +
man/group_by_prepare.Rd | 7 +-
man/group_indices.Rd | 14 +-
man/group_size.Rd | 2 +-
man/grouped_df.Rd | 30 +-
man/groups.Rd | 22 +-
man/id.Rd | 11 +-
man/ident.Rd | 19 +
man/if_else.Rd | 11 +-
man/init_logging.Rd | 17 +
man/join.Rd | 87 +-
man/join.tbl_df.Rd | 51 +-
man/join.tbl_sql.Rd | 131 --
man/lahman.Rd | 67 -
man/lazy_ops.Rd | 40 -
man/lead-lag.Rd | 13 +-
man/location.Rd | 4 +-
man/make_tbl.Rd | 7 +-
man/mutate.Rd | 98 +-
man/n.Rd | 7 +-
man/n_distinct.Rd | 5 +-
man/na_if.Rd | 11 +-
man/named_commas.Rd | 16 -
man/nasa.Rd | 18 +-
man/near.Rd | 3 +-
man/nth.Rd | 30 +-
man/nycflights13.Rd | 30 -
man/order_by.Rd | 9 +-
man/partial_eval.Rd | 64 -
man/progress_estimated.Rd | 5 +-
man/pull.Rd | 43 +
man/query.Rd | 20 -
man/ranking.Rd | 46 +-
man/recode.Rd | 32 +-
man/reexports.Rd | 46 +-
man/rowwise.Rd | 9 +-
man/same_src.Rd | 1 -
man/sample.Rd | 33 +-
man/scoped.Rd | 63 +
man/se-deprecated.Rd | 121 ++
man/select.Rd | 108 +-
man/select_all.Rd | 70 +
man/select_helpers.Rd | 27 +-
man/select_if.Rd | 32 -
man/select_var.Rd | 49 +
man/select_vars.Rd | 74 +-
man/setops.Rd | 9 +-
man/slice.Rd | 28 +-
man/sql.Rd | 52 +-
man/sql_build.Rd | 53 -
man/sql_quote.Rd | 24 -
man/sql_variant.Rd | 94 -
man/src-test.Rd | 28 -
man/src.Rd | 9 +-
man/src_dbi.Rd | 121 ++
man/src_local.Rd | 5 +-
man/src_memdb.Rd | 34 -
man/src_mysql.Rd | 171 --
man/src_postgres.Rd | 173 --
man/src_sql.Rd | 22 -
man/src_sqlite.Rd | 168 --
man/src_tbls.Rd | 2 +-
man/starwars.Rd | 30 +
man/storms.Rd | 34 +
man/summarise.Rd | 90 +-
man/summarise_all.Rd | 147 +-
man/summarise_each.Rd | 47 +-
man/tally.Rd | 96 +-
man/tbl.Rd | 5 +-
man/tbl_cube.Rd | 26 +-
man/tbl_df.Rd | 57 +-
man/tbl_sql.Rd | 25 -
man/tbl_vars.Rd | 12 +-
man/testing.Rd | 32 -
man/top_n.Rd | 26 +-
man/translate_sql.Rd | 128 --
man/vars.Rd | 26 +-
man/with_order.Rd | 1 -
src/Makevars | 4 +-
src/Makevars.win | 2 +-
src/RcppExports.cpp | 916 +++++-----
src/address.cpp | 87 +-
src/api.cpp | 456 ++---
src/arrange.cpp | 137 +-
src/between.cpp | 21 +-
src/bind.cpp | 615 ++++---
src/combine_variables.cpp | 9 +-
src/distinct.cpp | 69 +-
src/dplyr.cpp | 1830 --------------------
src/encoding.cpp | 59 +
src/filter.cpp | 355 +---
src/group_by.cpp | 43 +
src/group_indices.cpp | 171 +-
src/hybrid.cpp | 198 +++
src/hybrid_count.cpp | 56 +
src/hybrid_debug.cpp | 93 +
src/hybrid_in.cpp | 52 +
src/hybrid_minmax.cpp | 72 +
src/hybrid_nth.cpp | 322 ++++
src/hybrid_offset.cpp | 98 ++
src/hybrid_simple.cpp | 78 +
src/hybrid_window.cpp | 166 ++
src/init.cpp | 60 +-
src/join.cpp | 647 +++----
src/join_exports.cpp | 390 +++++
src/mutate.cpp | 212 +++
src/nth.cpp | 240 ---
src/pch.h | 1 +
src/rlang-export.c | 37 +
src/select.cpp | 114 +-
src/set.cpp | 310 ++++
src/slice.cpp | 195 +++
src/strings_addresses.cpp | 19 -
src/summarise.cpp | 246 ++-
src/test.cpp | 138 +-
src/utils.cpp | 290 ++++
src/window.cpp | 48 +-
tests/testthat/helper-astyle.R | 30 +
tests/testthat/helper-combine.R | 270 +++
tests/testthat/helper-encoding.R | 53 +
tests/testthat/helper-groups.R | 13 +
tests/testthat/helper-hybrid.R | 66 +
tests/testthat/helper-output.R | 5 -
tests/testthat/helper-src.R | 15 -
tests/testthat/helper-torture.R | 1 +
tests/testthat/output/iris--70.txt | 16 -
tests/testthat/output/iris-3-5.txt | 20 -
tests/testthat/output/iris-5-30.txt | 15 -
tests/testthat/output/iris-head-30-80.txt | 11 -
tests/testthat/output/mtcars-8-30.txt | 19 -
tests/testthat/test-DBI.R | 15 +
tests/testthat/test-arrange.r | 116 +-
tests/testthat/test-as-data-frame.R | 44 -
tests/testthat/test-astyle.R | 9 +
tests/testthat/test-between.R | 21 +
tests/testthat/test-binds.R | 425 +++--
tests/testthat/test-case-when.R | 73 +-
tests/testthat/test-coalesce.R | 6 +-
tests/testthat/test-colwise-arrange.R | 13 +
tests/testthat/test-colwise-filter.R | 31 +
tests/testthat/test-colwise-group-by.R | 16 +
tests/testthat/test-colwise-mutate.R | 133 ++
tests/testthat/test-colwise-select.R | 73 +
tests/testthat/test-colwise.R | 86 -
tests/testthat/test-combine.R | 211 ++-
tests/testthat/test-compute.R | 34 -
tests/testthat/test-copy_to.R | 66 +
tests/testthat/test-count-tally.r | 107 ++
tests/testthat/test-data_frame.R | 4 +-
tests/testthat/test-distinct.R | 72 +-
tests/testthat/test-do.R | 83 +-
tests/testthat/test-equality.r | 134 +-
tests/testthat/test-equiv-manip.r | 26 -
tests/testthat/test-filter-windowed.R | 47 -
tests/testthat/test-filter.r | 333 ++--
tests/testthat/test-funs-predicates.R | 23 +
tests/testthat/test-funs.R | 61 +
tests/testthat/test-group-by.r | 221 ++-
tests/testthat/test-group-indices.R | 22 +-
tests/testthat/test-group-size.R | 42 +-
tests/testthat/test-hybrid-traverse.R | 464 +++++
tests/testthat/test-hybrid.R | 888 ++++++++++
tests/testthat/test-if-else.R | 104 +-
tests/testthat/test-internals.r | 11 +-
tests/testthat/test-joins.r | 691 +++++---
tests/testthat/test-lazy-ops.R | 110 --
tests/testthat/test-lazyeval-compat.R | 27 +
tests/testthat/test-lead-lag.R | 60 +-
tests/testthat/test-mutate-windowed.R | 235 ++-
tests/testthat/test-mutate.r | 666 ++++---
tests/testthat/{test-count.r => test-n_distinct.R} | 26 +-
tests/testthat/test-na-if.R | 16 +-
tests/testthat/test-nth-value.R | 26 +-
tests/testthat/test-output.R | 34 -
tests/testthat/test-overscope.R | 13 +
tests/testthat/test-pull.R | 54 +
tests/testthat/test-rank.R | 26 +
tests/testthat/test-rbind.R | 309 ++++
tests/testthat/test-recode.R | 115 +-
tests/testthat/test-sample.R | 61 +-
tests/testthat/test-select-helpers.R | 237 ++-
tests/testthat/test-select.r | 121 +-
tests/testthat/test-sets.R | 86 +-
tests/testthat/test-slice.r | 90 +-
tests/testthat/test-sql-build.R | 186 --
tests/testthat/test-sql-escape.r | 35 -
tests/testthat/test-sql-joins.R | 51 -
tests/testthat/test-sql-render.R | 157 --
tests/testthat/test-sql-translation.r | 119 --
tests/testthat/test-summarise.r | 904 +++++++---
tests/testthat/test-tally.R | 7 -
tests/testthat/test-tbl-cube.R | 84 +-
tests/testthat/test-tbl-sql.r | 11 -
tests/testthat/test-tbl.R | 9 +
tests/testthat/test-top-n.R | 11 +
tests/testthat/test-ts.R | 16 +
tests/testthat/test-underscore.R | 412 +++++
tests/testthat/test-union-all.R | 11 -
tests/testthat/test-utils.R | 13 +
tests/testthat/test-window.R | 58 +-
tests/testthat/utf-8.R | 14 +-
vignettes/compatibility.Rmd | 288 +++
vignettes/data_frames.Rmd | 79 -
{inst/doc => vignettes}/data_frames.html | 60 +-
vignettes/databases.Rmd | 257 ---
{inst/doc => vignettes}/databases.html | 206 +--
vignettes/disabled/benchmark-baseball.Rmd | 182 --
inst/doc/introduction.Rmd => vignettes/dplyr.Rmd | 333 ++--
vignettes/hybrid-evaluation.Rmd | 344 ----
{inst/doc => vignettes}/hybrid-evaluation.html | 14 +-
.../internals}/hybrid-evaluation.Rmd | 11 +-
vignettes/introduction.html | 567 ++++++
vignettes/new-sql-backend.Rmd | 123 --
{inst/doc => vignettes}/new-sql-backend.html | 4 +-
vignettes/notes/mysql-setup.Rmd | 40 -
vignettes/notes/postgres-setup.Rmd | 36 -
vignettes/notes/vagrant-setup.Rmd | 42 -
vignettes/nse.Rmd | 102 --
{inst/doc => vignettes}/nse.html | 8 +-
vignettes/programming.Rmd | 587 +++++++
vignettes/two-table.Rmd | 22 +-
vignettes/window-functions.Rmd | 218 +--
vignettes/windows.graffle | Bin 3036 -> 0 bytes
vignettes/windows.png | Bin 28668 -> 0 bytes
546 files changed, 32049 insertions(+), 27451 deletions(-)
diff --git a/DESCRIPTION b/DESCRIPTION
index 065e4a7..5354956 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -1,51 +1,38 @@
-Package: dplyr
Type: Package
-Version: 0.5.0
+Package: dplyr
+Version: 0.7.4
Title: A Grammar of Data Manipulation
Description: A fast, consistent tool for working with data frame like objects,
both in memory and out of memory.
Authors at R: c(
person("Hadley", "Wickham", , "hadley at rstudio.com", c("aut", "cre")),
person("Romain", "Francois", , "romain at r-enthusiasts.com", role = "aut"),
- person("RStudio", role = "cph")
+ person("Lionel", "Henry", role = "aut"),
+ person("Kirill", "Müller", role = "aut"),
+ person("RStudio", role = c("cph", "fnd"))
)
-URL: https://github.com/hadley/dplyr
-BugReports: https://github.com/hadley/dplyr/issues
+URL: http://dplyr.tidyverse.org, https://github.com/tidyverse/dplyr
+BugReports: https://github.com/tidyverse/dplyr/issues
+Encoding: UTF-8
Depends: R (>= 3.1.2)
-Imports: assertthat, utils, R6, Rcpp (>= 0.12.3), tibble, magrittr,
- lazyeval (>= 0.1.10), DBI (>= 0.4.1)
-Suggests: RSQLite (>= 1.0.0), RMySQL, RPostgreSQL, testthat, knitr,
- microbenchmark, ggplot2, mgcv, Lahman (>= 3.0-1), nycflights13,
- methods, rmarkdown, covr, dtplyr
+Imports: assertthat, bindrcpp (>= 0.2), glue (>= 1.1.1), magrittr,
+ methods, pkgconfig, rlang (>= 0.1.2), R6, Rcpp (>= 0.12.7),
+ tibble (>= 1.3.1), utils
+Suggests: bit64, covr, dbplyr, dtplyr, DBI, ggplot2, hms, knitr, Lahman
+ (>= 3.0-1), mgcv, microbenchmark, nycflights13, rmarkdown,
+ RMySQL, RPostgreSQL, RSQLite, testthat, withr
VignetteBuilder: knitr
-LinkingTo: Rcpp (>= 0.12.0), BH (>= 1.58.0-1)
+LinkingTo: Rcpp (>= 0.12.0), BH (>= 1.58.0-1), bindrcpp, plogr
LazyData: yes
License: MIT + file LICENSE
-Collate: 'RcppExports.R' 'all-equal.r' 'bench-compare.r' 'bind.r'
- 'case_when.R' 'coalesce.R' 'colwise.R' 'compute-collect.r'
- 'copy-to.r' 'data-lahman.r' 'data-nasa.r' 'data-nycflights13.r'
- 'data-temp.r' 'data.r' 'dataframe.R' 'dbi-s3.r' 'desc.r'
- 'distinct.R' 'do.r' 'dplyr.r' 'explain.r' 'failwith.r' 'funs.R'
- 'group-by.r' 'group-indices.R' 'group-size.r' 'grouped-df.r'
- 'id.r' 'if_else.R' 'inline.r' 'join.r' 'lazy-ops.R'
- 'lead-lag.R' 'location.R' 'manip.r' 'na_if.R' 'near.R'
- 'nth-value.R' 'order-by.R' 'over.R' 'partial-eval.r'
- 'progress.R' 'query.r' 'rank.R' 'recode.R' 'rowwise.r'
- 'sample.R' 'select-utils.R' 'select-vars.R' 'sets.r'
- 'sql-build.R' 'sql-escape.r' 'sql-generic.R' 'sql-query.R'
- 'sql-render.R' 'sql-star.r' 'src-local.r' 'src-mysql.r'
- 'src-postgres.r' 'src-sql.r' 'src-sqlite.r' 'src-test.r'
- 'src.r' 'tally.R' 'tbl-cube.r' 'tbl-df.r' 'tbl-lazy.R'
- 'tbl-sql.r' 'tbl.r' 'tibble-reexport.r' 'top-n.R'
- 'translate-sql-helpers.r' 'translate-sql-base.r'
- 'translate-sql-window.r' 'translate-sql.r' 'utils-format.r'
- 'utils-replace-with.R' 'utils.r' 'view.r' 'zzz.r'
-RoxygenNote: 5.0.1
+RoxygenNote: 6.0.1
NeedsCompilation: yes
-Packaged: 2016-06-23 21:54:41 UTC; hadley
+Packaged: 2017-09-16 15:25:52 UTC; muelleki
Author: Hadley Wickham [aut, cre],
Romain Francois [aut],
- RStudio [cph]
+ Lionel Henry [aut],
+ Kirill Müller [aut],
+ RStudio [cph, fnd]
Maintainer: Hadley Wickham <hadley at rstudio.com>
Repository: CRAN
-Date/Publication: 2016-06-24 15:37:11
+Date/Publication: 2017-09-28 20:43:29 UTC
diff --git a/MD5 b/MD5
index 2194bee..899ede6 100644
--- a/MD5
+++ b/MD5
@@ -1,418 +1,432 @@
-aa3e0c29f28f6fe74b1dfbb4dc903ccc *DESCRIPTION
+5d3e97d0ccb60812c82b801207f08b5d *DESCRIPTION
c7180788a8ec3035d54fc733f8939ade *LICENSE
-f4a5f756f514349951c5ec84b93636ee *NAMESPACE
-df416f49e2ff07286b4bed7cc1cdb4c0 *NEWS.md
-eea1c07405a72cea3874e4fe2b127c8e *R/RcppExports.R
-e25295d71ce091aceda058f407eec2aa *R/all-equal.r
-90c7cd1d5adb69a93ddac6dd895d0fd3 *R/bench-compare.r
-51c0aa1f2cc86ead395abc1fa4524fee *R/bind.r
-8e0377821f88705784c0321953b0663f *R/case_when.R
-1c01679d6e600d1278ef1b723347bc7a *R/coalesce.R
-9ee02acd3a33cdd89f4188f966d88457 *R/colwise.R
-8d4b0dddc334f95ffd57c30cdba0586f *R/compute-collect.r
-e83b9cc95c18a399e261c6edcdb3d70b *R/copy-to.r
-dc548d15d9749f32ba76abb974185c7b *R/data-lahman.r
-5744860420ce850f3f6f83c02d0d59d4 *R/data-nasa.r
-925eea5e8ab3db49debd3f0788721d8d *R/data-nycflights13.r
-57014418e390e2883376859596a05cb0 *R/data-temp.r
-1f566eaecc587981286785466196bb10 *R/data.r
-741613deb1755700b01d149abbe896d9 *R/dataframe.R
-66c418ee3dfa6c110977484671ee4367 *R/dbi-s3.r
-2ad6e93a659bdb0307275e035028b20d *R/desc.r
-5155f59937ca9b2ea26e011d2a021e70 *R/distinct.R
-83933cbc3fdeb37f96a048b8bb0be81c *R/do.r
-5d3f9df6281eb80bb8d96555d499b52a *R/dplyr.r
-f8fe296a35950e828e314ab53e995b2b *R/explain.r
-8605ac8a1596e18fe289b5d587cdd547 *R/failwith.r
-e5a57abaad8b3f758982ef81bfab6ebb *R/funs.R
-dcee10bf855e66836195ade734f8b44f *R/group-by.r
-710ba270cacf7bab761901b0cd900bb8 *R/group-indices.R
-78ef572727d34a8e9846dd41b10556da *R/group-size.r
-c32b6fc737e99b15edd55979a08e4ca4 *R/grouped-df.r
-4ef151c22a8aad93446cff318aa4aace *R/id.r
-ac5fc3bb936bd5fcef047e035a7f7cf6 *R/if_else.R
-acb7601f471d804cf744a93944ae5e24 *R/inline.r
-9d4292c76daccaf4f4e796e013a9830a *R/join.r
-ddc10799bd63e0b3cbd6795d0a8a2c02 *R/lazy-ops.R
-9cf893eef4dd88d55b72ab2fa667452a *R/lead-lag.R
-28ca41c202068578987546b7b28508b2 *R/location.R
-55c7a9d6c3b3326ce348ec2e46bb48ca *R/manip.r
-7d5ece2a1f8ee3525146f32e6b78a619 *R/na_if.R
-0d74fea833d6e0c1aa2a420b35d9f00f *R/near.R
-5a076f2f2e9f036f8c6bf3040c5ed322 *R/nth-value.R
-d1b968745bf80108696be0ba53ef0f25 *R/order-by.R
-000ae5b2290ea72603e5c3d8c2e17ae5 *R/over.R
-88331a590ae7d2ca1b66e4a775e605ee *R/partial-eval.r
-7c21c5cfa4804253f1f1ef6a41f68eb2 *R/progress.R
-35969413ed9b94a4f1c39da6e678537b *R/query.r
-51cc529ae4c9a0c82de7cc42be8bd33a *R/rank.R
-1a640f5204163bbc7fe2801bf147e6c3 *R/recode.R
-d948412fd576d458e14b9c4f5f3fe292 *R/rowwise.r
-9e46ec64ff93bec7b02650a3f0e80564 *R/sample.R
-5c0b1f78027b0d3372689609cf7b0e0e *R/select-utils.R
-3538593fa5c0d5f577629f2c36795296 *R/select-vars.R
-a1e3976fe616d58dc89e5c58204c1635 *R/sets.r
-c072b27e9eae5ac6198af4bd1ff58141 *R/sql-build.R
-7b32a76077c976c69043c217f5554b04 *R/sql-escape.r
-d7d5e732144ce838a08b803a28e903ee *R/sql-generic.R
-635715c01abc677576dcdefe8951a873 *R/sql-query.R
-fd6e2f2b5ddd998a99ed65fcb6d427ab *R/sql-render.R
-1cccd71b126b88bdb03af1fa5dde6aaa *R/sql-star.r
-8fc39ef2a86fc023715a697ab015046c *R/src-local.r
-4cd1f5b2b18876acb136c03b3f6f7b90 *R/src-mysql.r
-9a18e38449055a8da8942abf8de25c80 *R/src-postgres.r
-75c920875cb592b02b7137c45cd47066 *R/src-sql.r
-62fb7bdc50a0e7ea19805c3a6004127b *R/src-sqlite.r
-a37aef47e0fe866c56b7fc3517b7a22b *R/src-test.r
-00d2418632371e083cf37c488d69bb16 *R/src.r
-a3a1ee2edfc7633bedad94912ccd4142 *R/tally.R
-51e69945580769d426125b2dcfb1d731 *R/tbl-cube.r
-4b4580b55e63bca6cc3573de7553a651 *R/tbl-df.r
-418ec218279d6b63afb5e31f506b303f *R/tbl-lazy.R
-ea1faaa194986032b002a7b64e6cc353 *R/tbl-sql.r
-7dc61c707f3bad285436c35c3b1bd668 *R/tbl.r
-71d3ef9a8437788d3b8b3ef745f6b38f *R/tibble-reexport.r
-a7065e2960ad91f3240c43a55cb39b35 *R/top-n.R
-842112f7bea26b822f4867bd00152468 *R/translate-sql-base.r
-39686ef0dcae98db245dbcaa8174150d *R/translate-sql-helpers.r
-857744293de9d98d8a1e7e858b8ef82e *R/translate-sql-window.r
-7a60fe0c640d3cb9bdf9a7e137723bac *R/translate-sql.r
-1df4a2a9110f3023281cdfba071e78ce *R/utils-format.r
-66ef8f7e5aff5721e5ec6dff83c13847 *R/utils-replace-with.R
-49afdd813cc1eb74eda8cdf47ee99caf *R/utils.r
-4d9128b66f58cee5fcb57a8e470c82c4 *R/view.r
-02684685223b06b5af5ac9db8b8e8812 *R/zzz.r
-dd9b7bd8d422a2734aab7e65533db436 *README.md
-6e57a3eebaf81cfc9750211ac41f84b1 *build/vignette.rds
+fda148fa29caf4ffc61a95a65f59857c *NAMESPACE
+704be4dd79bf6ec9b2ed93b881486d43 *NEWS.md
+06c7e661087cb1e3dd6cf582348132fe *R/RcppExports.R
+171ed0512c09d7529184516b3077c745 *R/all-equal.r
+9d5b6b054df3934c26af87b53d1d0bc0 *R/bench-compare.r
+b7f18fc6eceb4fb1c58cc840fba6c0cb *R/bind.r
+ba6ec0ef7145caba488bbf5136912a4a *R/case_when.R
+1dd4345f8b8e3428fcb1d2b8f1c43726 *R/coalesce.R
+161fe7893120baf2016cc1497cd679f7 *R/colwise-arrange.R
+34b4ddebf79f8ad6449d06f380d9e451 *R/colwise-filter.R
+798a68feea2e2674bc915de0e08821ae *R/colwise-group-by.R
+a47f4f42c5db11cfe655b54d88726f5d *R/colwise-mutate.R
+cf50adf586fb7db7a16afcd537ed4a2e *R/colwise-select.R
+5f640cbc7fb1e37e83697beeb8758aa0 *R/colwise.R
+1efca41aa17607b35488db3c3e710760 *R/compat-dbplyr.R
+79ea586e36b0123e161a26e98bd99b64 *R/compat-lazyeval.R
+1f93a1dc9780598f4c7fed9392807141 *R/compat-purrr.R
+dc4ec82504fd1f99c5de95eb3fcfcf41 *R/compute-collect.r
+1d3ba34ddd4ca746c6c5d3cb567c3b8b *R/copy-to.r
+b09b48b61bfb8aaf97a9c5a3eebc0650 *R/count-tally.R
+69ad54c9e8ff6c60be3239e99ceb1d9f *R/data-bands.R
+a08ae483d3ebcd598d3b457c77415561 *R/data-nasa.r
+c11f9a6ce16ea9d659aa93e36f133a42 *R/data-starwars.R
+7132044ff01b9862041b903b64520b60 *R/data-storms.R
+08d9f42d850fba21f350ece052304638 *R/dataframe.R
+15ed50b9e7d6f0db6881250facc997cf *R/dbplyr.R
+4a41a291a9396a2e2692a05eb8a08982 *R/desc.r
+8708fe281729c792e2e85b48cccc3baa *R/distinct.R
+f4267e6fe5e7daffb691491e73394a02 *R/do.r
+1ae45b745dc27114a9a2ace6625b597b *R/dplyr.r
+ce0d3e95149be325fb2338305b748d92 *R/dr.R
+079e6c98c16297edada3de1889cd88f5 *R/error.R
+64db247d9d02897dfa0770acbe43a9b3 *R/explain.r
+55f7f1e4983897a6a1823e06b1931921 *R/failwith.r
+80a1603a7b2131b2c248f41e2806a008 *R/funs-predicates.R
+e946e37a9b3024865dcab814e59ada85 *R/funs.R
+a30aab6313ecd5753f11abb442ec2042 *R/group-by.r
+9fd1ef09b9285ceb7fe43234287c0e64 *R/group-indices.R
+b9eb655a68656718faed6b6293b0bd12 *R/group-size.r
+31f7fc93577732ddb419bdec4364a6e5 *R/grouped-df.r
+913222ea8e85d998b5957658d58c3425 *R/hybrid.R
+1b28eb1d99c911a94e37dc8f66385b05 *R/id.r
+be8693ed1072fbb5314ba8af3c8037ef *R/if_else.R
+cbbb63ead2d57c3f54c79af335b67a46 *R/inline.r
+8ab21eba39c7b173b53ad0abfb1da571 *R/join.r
+b0b31475ba61638b58eeeff8b6d57a5c *R/lead-lag.R
+30ab52450e4bc79ea401076d6f56e0ba *R/location.R
+273dd4d498e07be13c8260cb3f7f0805 *R/manip.r
+ee09f0943c096342251416f4b8fa975d *R/na_if.R
+f947b13d00a896951653b229a0b36634 *R/near.R
+3c76c37d9fb4f6f094a75319aaeddae8 *R/nth-value.R
+967e02eba1e4d8293cf451b0fd4225a8 *R/order-by.R
+9748fc950eb263061a8267ac94d05745 *R/progress.R
+c57cf475606ba40263d44a8e40ea4fdd *R/pull.R
+c64173e9b622207650c072dd7db9e91f *R/rank.R
+7ef0a84818945f9561f220e1900983fb *R/rbind.R
+46a90cccda828e821c2ff1f0fc965601 *R/recode.R
+69a92531f4b91811f4ae95266ed9694a *R/reexport-rlang.R
+6d5af4dec50fa749662d716546b8ed9f *R/reexport-tibble.r
+29b65aaab9243edbc3ded6d3c51a508b *R/rowwise.r
+eba2f88afe502917471a861fda344314 *R/sample.R
+d890bead525d183413ab99274fc3337e *R/select-utils.R
+b6fc3a873dc6b0591259ff0c7c9902bb *R/select-var.R
+eb91fa6eb9457888fe00fff0be31678a *R/select-vars.R
+4472ae6936f469db2f7a12f3474932c2 *R/sets.r
+669645c36ac8ea26f05c67f6ef0e4881 *R/src-local.r
+4944ea83dd1f1f8c7ddbcda96cbfe354 *R/src.r
+24cbf7a190bc3db07d8aeafac47414ac *R/src_dbi.R
+8ed4500398ca393ce30b2b8e5ced900f *R/tbl-cube.r
+3cbe6f1ae2c388e1564f47b10b64a00c *R/tbl-df.r
+2b8db5ecd1cf6aa8e5a93ad5dd6f8436 *R/tbl.r
+2fd9b688171ace51c2dcfa701116b55d *R/top-n.R
+76d10c70d94a48bb5cf644707dee557a *R/ts.R
+575b6d62afd3b686010098581352c171 *R/utils-expr.R
+65f972e2a462c2ccc70ab73a0513b6bb *R/utils-format.r
+9b435f53092efc02b03e60ce76460257 *R/utils-replace-with.R
+d0d68fd08d765f5b8dd8c064b6f5f20f *R/utils.r
+169de4e80fecfb554db79329bdfe73c9 *R/zzz.r
+8eee241c12aa84afedcfce4c40307764 *README.md
+63149a6da765b5a490e498ac3431433e *build/vignette.rds
+a79561c8013e7a7f3c23d509f4918bf8 *data/band_instruments.rda
+3aa4b1478fc31219480e88c876c3aeed *data/band_instruments2.rda
+4d44ad5e4198daccbd4227dca895750b *data/band_members.rda
db40a0145d2a88069865e7f18d3dcf1f *data/nasa.rda
-85456c5b938cb1527a0e13aae6f275cc *inst/doc/data_frames.R
-a7e4e8edbc2da0d11146f682f90890fa *inst/doc/data_frames.Rmd
-de3cc983d10bb6bc6d82048a5cc1e572 *inst/doc/data_frames.html
-09b1766353680875b22a82603054ffba *inst/doc/databases.R
-067a9145d801d3684926b6d7072ff700 *inst/doc/databases.Rmd
-e482c7ba86cada7f1b556da94be26bbe *inst/doc/databases.html
-f1d339ea68b753d414b12541c3a03517 *inst/doc/hybrid-evaluation.R
-cb972bd572ea9281da84ecebf1c2cb9a *inst/doc/hybrid-evaluation.Rmd
-5fb2c1fb7796693a07c0ee71f283f994 *inst/doc/hybrid-evaluation.html
-3a87f78666c4ed620532cc006d9bd6fe *inst/doc/introduction.R
-28297cb14323db649c593799836ea306 *inst/doc/introduction.Rmd
-5aede01f84a9d59794a47ae7dc4a7730 *inst/doc/introduction.html
-813b7ee928acc746bad28da6ae21ab44 *inst/doc/new-sql-backend.R
-cf624089b182d725bddd2f9bb0499087 *inst/doc/new-sql-backend.Rmd
-06f56a93c7d2755d5ef41437aa52055e *inst/doc/new-sql-backend.html
-a4e5fb35f77df28ea8a14a14a32eecb6 *inst/doc/nse.R
-5da8c99efc46ce7a22c208d07c6eabb3 *inst/doc/nse.Rmd
-8257d77617da9a0ca5dde609594fd76c *inst/doc/nse.html
+d23ee77269003009efa2b7a5fd95e84d *data/starwars.rda
+beee782d83b4bd711c01658781fbf643 *data/storms.rda
+049f1cc0f7cd980d607745d078eac484 *inst/doc/compatibility.R
+7258317bbf67e7807cd9c4246e59b89f *inst/doc/compatibility.Rmd
+aa3d6d703cb5bbc84abef25b120dd0dc *inst/doc/compatibility.html
+517cadf4db98fab1155e5ee7d9f6b5da *inst/doc/dplyr.R
+f805eb5d31ac8be006c30e973cea77cc *inst/doc/dplyr.Rmd
+bf7c3a5075a9efdaa4fcc47f643ecec2 *inst/doc/dplyr.html
+b66796b47048c22d48ed8ac0209d69b4 *inst/doc/programming.R
+5da6cf8cd907f814a40a12641483c53c *inst/doc/programming.Rmd
+695a4d59eb0c9e542cb1ddd125336335 *inst/doc/programming.html
3bc4473fe5f8257cd9342036475ce6c1 *inst/doc/two-table.R
-40720863442bd95e35b9163bb92ad00c *inst/doc/two-table.Rmd
-92e682f5000cb6e54ce79eead79f13b3 *inst/doc/two-table.html
-6e3c34527dd49c97dfa4688e589f01ea *inst/doc/window-functions.R
-e9a5e0fadbdb427a3f677e77fbcfe8a7 *inst/doc/window-functions.Rmd
-43fa0a67b7865bc8ff35b5f829f52d4c *inst/doc/window-functions.html
-9e4cfe7bed4853f880af319556d7bc82 *inst/include/dplyr.h
-434db4df7063094f9c8284345599ddfd *inst/include/dplyr/BoolResult.h
-e4cc6fb59b760c0b1f0bf7cdb69bd982 *inst/include/dplyr/CharacterVectorOrderer.h
-fa21bbed46012f02aa145c1c4154d431 *inst/include/dplyr/Collecter.h
-1a3e2ae629dd0584640650842285a0e8 *inst/include/dplyr/DataFrameAble.h
-22de2fce75ee32ac70d2845548cd4389 *inst/include/dplyr/DataFrameColumnSubsetVisitor.h
-5603af9637d01ea542e91c15aac3bb99 *inst/include/dplyr/DataFrameColumnVisitor.h
-7a282d452d5958d084409c5a5e5977c5 *inst/include/dplyr/DataFrameJoinVisitors.h
-af1033fae0022286fac3fa8cbd190fc8 *inst/include/dplyr/DataFrameSubsetVisitors.h
-698874c7587b62fbc3a257574c87537c *inst/include/dplyr/DataFrameVisitors.h
-d0f5d08a9864e35619e7a436c5e270a1 *inst/include/dplyr/DataFrameVisitorsIndexMap.h
-19bea1904ba92b7a8d6ab86a8ff819ca *inst/include/dplyr/DataFrameVisitorsIndexSet.h
-9c2ddb8a034638bbf84c729edb7764e1 *inst/include/dplyr/EmptySubset.h
-e843dd7afc293a3af2533b8c73a1e57a *inst/include/dplyr/FullDataFrame.h
-7df813377efb92f001e1425ed53c12ba *inst/include/dplyr/Gatherer.h
-714d8e6f0d082fa2af20366070765592 *inst/include/dplyr/GroupedDataFrame.h
-693e5671a83b6a213735795ee638be5a *inst/include/dplyr/JoinVisitor.h
-f63b52764c8eb34392604cce66c337a3 *inst/include/dplyr/JoinVisitorImpl.h
-c8ecf98f08b3d4206327b0a35837fff0 *inst/include/dplyr/MatrixColumnSubsetVectorVisitor.h
-5de6a25c80e0380c2231bb082342e053 *inst/include/dplyr/MatrixColumnVisitor.h
-55c5d9654919446e5688497542e3e112 *inst/include/dplyr/MultipleVectorVisitors.h
-22c5d80296aca35f8acf495348e724e6 *inst/include/dplyr/NamedListAccumulator.h
-c01a19b102d76624a46afa3eccadcc41 *inst/include/dplyr/Order.h
-201d2376ace42d716708a0bd208f4f39 *inst/include/dplyr/OrderVisitor.h
-3895e89e90e7b9cad2907a8d12d23788 *inst/include/dplyr/OrderVisitorImpl.h
-790068993fc7107753d829f6fad3d033 *inst/include/dplyr/Replicator.h
-fc71a743f1596aab33fbd74418abcf38 *inst/include/dplyr/Result/CallElementProxy.h
-9a4e0cfc3cdda3d3213c5e4176faf695 *inst/include/dplyr/Result/CallProxy.h
-0a7f29dbd36faddbebbdebfbccd16b2b *inst/include/dplyr/Result/CallbackProcessor.h
-043d8d2a8b3509e242f81ca7afd94f01 *inst/include/dplyr/Result/ConstantResult.h
-a11a733179d390cc1267d9241c127c83 *inst/include/dplyr/Result/Count.h
-1c00c98d6d2f246f31aea7ed6686bfa8 *inst/include/dplyr/Result/Count_Distinct.h
-d77dd70ed61dd47ec4bc7888bd35b41f *inst/include/dplyr/Result/CumMax.h
-dba254d0bce10cdc6af824aa0ffc46f0 *inst/include/dplyr/Result/CumMin.h
-bd1950a6d4f937fc457b3438d55e5e51 *inst/include/dplyr/Result/CumSum.h
-1ec86c2bc54751ee7aa4e9e23b86eb57 *inst/include/dplyr/Result/DelayedProcessor.h
-c5b922cfb213c9e1ccbeaf39cfe2683e *inst/include/dplyr/Result/DelayedReducer.h
-572fd32b2243408774737e6608ab4dd1 *inst/include/dplyr/Result/Everything.h
-aae779ac1588862e56633c410682eca3 *inst/include/dplyr/Result/GroupedCallProxy.h
-dcf33c1ca0588e7730a3860576f4b53c *inst/include/dplyr/Result/GroupedCallReducer.h
-2cdf530b5adb6f4b459f80fa373299f1 *inst/include/dplyr/Result/GroupedHybridCall.h
-a88d38e77cdecde54258732e99b1bd12 *inst/include/dplyr/Result/GroupedSubset.h
-26e5ba41fd3bd045b034fc70c9c9fbc5 *inst/include/dplyr/Result/In.h
-455279e366ee1988d00c404fa50a8b8c *inst/include/dplyr/Result/Lag.h
-973dd35add27a8b29b2ea5d8bb34d963 *inst/include/dplyr/Result/LazyGroupedSubsets.h
-aa40fcd81919f6b661f0ab8d74c14acd *inst/include/dplyr/Result/LazyRowwiseSubsets.h
-adb62c09ceb6953ccdbd2e53b4bbaa1d *inst/include/dplyr/Result/LazySubsets.h
-5649b8422d9a1334a3fd5f5ae5e4ca0a *inst/include/dplyr/Result/Lead.h
-1003d0a715cbba70f829166808f87aaf *inst/include/dplyr/Result/Mean.h
-bad845def0cee1e9f77acdcd00744987 *inst/include/dplyr/Result/Mutater.h
-83fd5618e60de18b2b02b1c0bec2b8ec *inst/include/dplyr/Result/Processor.h
-b9a38f1d89bb5cf8295c7e13170ecda8 *inst/include/dplyr/Result/Rank.h
-c4e62015a5a351237dffefbea4190cd6 *inst/include/dplyr/Result/Result.h
-49a8df17681b1c6223e9e1ba1d588582 *inst/include/dplyr/Result/ResultSet.h
-be5c361ce663cf4b287e0714001e750f *inst/include/dplyr/Result/RowwiseSubset.h
-a97005c067e180e1c38077e666a3decb *inst/include/dplyr/Result/Sd.h
-f5eedfbdfb4e8da8b2939b496e1270b2 *inst/include/dplyr/Result/Sum.h
-1dd11677f09f0416cb78fdd084847f0c *inst/include/dplyr/Result/Var.h
-d1368c2694b94b438429d14a896642d6 *inst/include/dplyr/Result/VectorSliceVisitor.h
-2468561b6e0cafd7fad38a79955f7834 *inst/include/dplyr/Result/all.h
-ad5012e7bff7af21ca2cf65392694490 *inst/include/dplyr/Result/factories.h
-0b10f855496ed489ad83b36a80b75ca6 *inst/include/dplyr/Result/is_smaller.h
-257a37b841098b3bc6d21607778a96fe *inst/include/dplyr/Result/max.h
-f0d649dba67a821cc8cb8c92c9b9fd2a *inst/include/dplyr/Result/min.h
-b8b155a2d0b6fc80ae5cd922b5283a95 *inst/include/dplyr/RowwiseDataFrame.h
-399c9151f66f6cb218d1adec778f435c *inst/include/dplyr/SubsetVectorVisitor.h
-4aca5c436d992500e90d687bb6f66564 *inst/include/dplyr/SubsetVectorVisitorImpl.h
-da7dc744adad9b3ce105271e25ce03f6 *inst/include/dplyr/SummarisedVariable.h
-65bdee0c7bcd15948eafa20a519b772e *inst/include/dplyr/VectorVisitor.h
-fb2962c46cb79b00a8f390251def2e9a *inst/include/dplyr/VectorVisitorImpl.h
-130dc063f5e2a5931a6b59c21781af3b *inst/include/dplyr/check_supported_type.h
-8dca975539ce4b40f43262713c59fe11 *inst/include/dplyr/comparisons.h
-c6f6d0831cc45867f94ad1c67aa1413a *inst/include/dplyr/comparisons_different.h
+2e918ccb3f55c5edb23e90a66c22ec0f *inst/doc/two-table.Rmd
+61f74c4216171f22d875f9d74be64944 *inst/doc/two-table.html
+aaaac3e4250247da6ab5015f7db18a2d *inst/doc/window-functions.R
+d4563df394699b6eab7fe746a4d4170b *inst/doc/window-functions.Rmd
+aafd9d61ae7264741c24044f1285cf90 *inst/doc/window-functions.html
+3e424de5198f078ad18872984990b435 *inst/include/dplyr.h
+fddaabf5a705772fffff51c593a77f73 *inst/include/dplyr/BoolResult.h
+d896149e837aaa3f10d7f5b29a38ca0c *inst/include/dplyr/CharacterVectorOrderer.h
+801ad83d3015466d28889b8a2333dab7 *inst/include/dplyr/Collecter.h
+167b6609eec2594c85fa87efe2d520cf *inst/include/dplyr/Column.h
+cba0b92133be94d980a1d9b99468de36 *inst/include/dplyr/DataFrameColumnSubsetVisitor.h
+e4955005a5a73b7d49db1b7f9d5ad578 *inst/include/dplyr/DataFrameColumnVisitor.h
+b7e33ab5d9130ac39059ff561ede0be0 *inst/include/dplyr/DataFrameJoinVisitors.h
+a51a28c4be90da9beb837215fb530d57 *inst/include/dplyr/DataFrameSubsetVisitors.h
+03795c70e48d07dbaa8fa98bf5130ca5 *inst/include/dplyr/DataFrameVisitors.h
+92eadf90f4af99d52f74b5f4c5a8b28c *inst/include/dplyr/DataFrameVisitorsIndexMap.h
+806d5dbc5b01c68f3bb80d685d1c9e2d *inst/include/dplyr/EmptySubset.h
+2ca4fa86d7ad86b4f5fe775f3e2b46b6 *inst/include/dplyr/FullDataFrame.h
+170934200556d209f6ccafd8cd68b397 *inst/include/dplyr/Gatherer.h
+f34e308a57766726a0fcbb37fd6cb14c *inst/include/dplyr/GroupedDataFrame.h
+e6ae71c9cfe96cad9919c29050b5c396 *inst/include/dplyr/Groups.h
+1e822b22eed00c17ac5602ed4fe4f55f *inst/include/dplyr/Hybrid.h
+efa1190671bd3f0a26300e794cd1f202 *inst/include/dplyr/HybridHandler.h
+3bcb5211adc721bea75a34ed559a2ae1 *inst/include/dplyr/HybridHandlerMap.h
+5ac8463b460c8a8086109f051f707ae0 *inst/include/dplyr/JoinVisitor.h
+04f6122a4792f1f3e1d27a19d8360540 *inst/include/dplyr/JoinVisitorImpl.h
+f919f78fe830c0d4c559c80bbaf997f0 *inst/include/dplyr/MatrixColumnSubsetVectorVisitor.h
+467e6bb3f84b81721d52f04959821385 *inst/include/dplyr/MatrixColumnVisitor.h
+bc4945595deea476775f973ae2f8e192 *inst/include/dplyr/MultipleVectorVisitors.h
+9101693c8280d0fa67cc1dcd4a474b10 *inst/include/dplyr/NamedListAccumulator.h
+665f5671f4c140ac71cb23518d285b87 *inst/include/dplyr/Order.h
+ecc6a7e5ff7403a38dbce54b3b0b7a2e *inst/include/dplyr/OrderVisitor.h
+b58874f6181be9d5445ded064209f8d3 *inst/include/dplyr/OrderVisitorImpl.h
+1aa139740dcfc055a5ec2eadfcc065c7 *inst/include/dplyr/Replicator.h
+242be476f92abda4a9f4dac1f840f141 *inst/include/dplyr/Result/CallElementProxy.h
+6955fc859891d5eabd426309658a2b51 *inst/include/dplyr/Result/CallProxy.h
+0a453dd66d28910a412eddc366dc5f21 *inst/include/dplyr/Result/CallbackProcessor.h
+7e3b917502e8a706545f03070c86e75e *inst/include/dplyr/Result/ConstantResult.h
+f7b49e2ff40baac1f7abcd4bfc34f7be *inst/include/dplyr/Result/Count.h
+8b911e410abef08ef8b0a462b223058b *inst/include/dplyr/Result/Count_Distinct.h
+83d322e3f4a26c252f2f97c7fd023a2e *inst/include/dplyr/Result/CumMax.h
+f8515a0105c5d57ef4d17f99362d7a6d *inst/include/dplyr/Result/CumMin.h
+3e9eb8b95c1b693c548580c3fbe3f20d *inst/include/dplyr/Result/CumSum.h
+55a41bf6a597dbf1a47739b9480f7aaf *inst/include/dplyr/Result/DelayedProcessor.h
+67e089cbf6463847be493133d1fb4dfb *inst/include/dplyr/Result/GroupedCallProxy.h
+d4afd21f5b3f215a3834b25a2d44f4a0 *inst/include/dplyr/Result/GroupedCallReducer.h
+0f97e37d49052391016140c1fe1e94fc *inst/include/dplyr/Result/GroupedHybridCall.h
+27dea30f4231575d3cac6ce5b0944cf9 *inst/include/dplyr/Result/GroupedSubset.h
+53e89ef7391c5ead3951cf459a641928 *inst/include/dplyr/Result/GroupedSubsetBase.h
+53dfed77866a3af77f49784a916b771e *inst/include/dplyr/Result/ILazySubsets.h
+60d1413f6cf7246ed82658cbf659a20b *inst/include/dplyr/Result/In.h
+5b939d0c785337cdfeb8aee9c5d53861 *inst/include/dplyr/Result/Lag.h
+74c6c6c5b2ae8bea7583babd92b00d5f *inst/include/dplyr/Result/LazyGroupedSubsets.h
+cfb60080a459019c3051407179520c38 *inst/include/dplyr/Result/LazyRowwiseSubsets.h
+befe32e04d585521d1749aa51ec20697 *inst/include/dplyr/Result/LazySubsets.h
+5ccf14cebd55563f6300dc6bbe318fd9 *inst/include/dplyr/Result/Lead.h
+98b14e643cc1a8bd5c5ffab5083a50dd *inst/include/dplyr/Result/Mean.h
+47ec3a1d7a84edc0e49c26d68587f7ea *inst/include/dplyr/Result/MinMax.h
+f2e2be64bd0d4703c78e56b38a53a485 *inst/include/dplyr/Result/Mutater.h
+a47c24f0009d1b1fceb49eacb3a7579c *inst/include/dplyr/Result/Processor.h
+d5b3cef289db60d33e2cdcc7243f67ba *inst/include/dplyr/Result/Rank.h
+6b368594157ae7ac67d94e66738f9c94 *inst/include/dplyr/Result/Result.h
+9bcb0b5575dbe0fbecb21c6a796047d4 *inst/include/dplyr/Result/RowwiseSubset.h
+8e999fb06f18b7f8eb8216c9e4b0e7d5 *inst/include/dplyr/Result/Sd.h
+0ecd3084e8eb1b101985d6b0c431cb24 *inst/include/dplyr/Result/Sum.h
+414e7639ee0b67e11aeb9dd7cb60c91f *inst/include/dplyr/Result/Var.h
+0f5ac54316490491778da828738ff8ba *inst/include/dplyr/Result/VectorSliceVisitor.h
+110aa6462aeb62d14150be4bf03574dd *inst/include/dplyr/Result/all.h
+1827f357666423e69a3a397839fc9381 *inst/include/dplyr/Result/is_smaller.h
+949720aee0c1f387e7e3778e74c874dc *inst/include/dplyr/RowwiseDataFrame.h
+d47d007aa274aa12d7380fc10b187f93 *inst/include/dplyr/SubsetVectorVisitor.h
+18a2608f679a204906158df4bb897cfc *inst/include/dplyr/SubsetVectorVisitorImpl.h
+cca57103e287e37257f19f85b2b6b1c1 *inst/include/dplyr/SummarisedVariable.h
+c5ec87ab36c8e60797408040b3c45879 *inst/include/dplyr/VectorVisitor.h
+50bec3f88630e875b4790d12084104d3 *inst/include/dplyr/VectorVisitorImpl.h
+825e5148a9519557834da35e22f6b41c *inst/include/dplyr/bad.h
+d3434e56b7562a8b95ce1e424765f972 *inst/include/dplyr/checks.h
+317d20e6de6ecb1b0c54754485f4e9a8 *inst/include/dplyr/comparisons.h
968c72563fe150a4903095b32b3321ab *inst/include/dplyr/config.h
-e308a0478857e84b10cbea772f2b5d3f *inst/include/dplyr/registration.h
-5709db5eda33e954de6bbb8af7689ab2 *inst/include/dplyr/subset_visitor.h
-ecad722435cd5c65878b4bc3d39b8a85 *inst/include/dplyr/tbl_cpp.h
-558cdd9a70fd63ad40aaa13fa42f1b85 *inst/include/dplyr/train.h
-145c521159390294d1e2974701ac69bc *inst/include/dplyr/vector_class.h
-756e9017d35d0fcbd5e5dd85ff7834f8 *inst/include/dplyr/visitor.h
-d0ebcca3f35315511e9b0e025b23f493 *inst/include/dplyr/visitor_set/VisitorEqualPredicate.h
-7ad8e7c5a6ceace68980921286cbadba *inst/include/dplyr/visitor_set/VisitorHash.h
-9e606a76113494341a6efc3131e42703 *inst/include/dplyr/visitor_set/VisitorSetEqual.h
-21deded1666d317f14de734a6f076260 *inst/include/dplyr/visitor_set/VisitorSetEqualPredicate.h
-d091b40fdd4a14bdc643f202d1df5297 *inst/include/dplyr/visitor_set/VisitorSetGreater.h
-f08039bf39d18abbcff46281b9c06743 *inst/include/dplyr/visitor_set/VisitorSetGreaterPredicate.h
-1dc213313dd60baca3e2ff7cf8e062c3 *inst/include/dplyr/visitor_set/VisitorSetHash.h
-cc655b554ec8e44366863349da486140 *inst/include/dplyr/visitor_set/VisitorSetHasher.h
-ea17f23e601ffc39bb4f97496df36ba0 *inst/include/dplyr/visitor_set/VisitorSetIndexMap.h
-7ac5586b69288246ecae172626bbc84f *inst/include/dplyr/visitor_set/VisitorSetIndexSet.h
-98df99055d9a8f8580daf967ed5c1e47 *inst/include/dplyr/visitor_set/VisitorSetLess.h
-81a8e6443d061806eef6ffb3023cfc05 *inst/include/dplyr/visitor_set/VisitorSetLessPredicate.h
-fde320cfabf9b74315ee683a8562fde9 *inst/include/dplyr/visitor_set/visitor_set.h
-4757256ddd6816a8b0229e0481bc7e8d *inst/include/dplyr/white_list.h
-53968b7c1769e6142d11d8249c58b1c5 *inst/include/dplyr/workarounds.h
-253debba0f058c04dc0898004dbbae5e *inst/include/dplyr/workarounds/static_assert.h
-94be59fe8b8f8f46093ce9c5913960df *inst/include/solaris/solaris.h
-2d6094a78a02a2d0a45a4ce9f9d05374 *inst/include/tools/Call.h
-e386491febed0acff2a2a31611800837 *inst/include/tools/DotsOf.h
-27f98ca6314512cd391fa16c53d879b5 *inst/include/tools/Encoding.h
-c1d634370e5f6a7e492f68e53037e9d3 *inst/include/tools/FilteredListOf.h
-15a7b33d16bfab37d8fa4305709c34e7 *inst/include/tools/LazyDots.h
-e86b05a29010a3d057f116b1c569c1c9 *inst/include/tools/ShrinkableVector.h
-85d6369f6232fea90237b82656ef4cc6 *inst/include/tools/SlicingIndex.h
-4e66b331baf7ee5a5fa7caf2d599f89d *inst/include/tools/StringUTF8.h
-d37d0fb920d9df887e5851fb22a1720b *inst/include/tools/SymbolMap.h
-ba4671f72d966749903fd18f21199266 *inst/include/tools/all_na.h
-36c25f902bf86c33cc255a3df5c30d03 *inst/include/tools/collapse.h
-23c65040c3c55b430aadb7565f4a8bfa *inst/include/tools/complex.h
-2dd28074edcf40dbf98ec483899f5244 *inst/include/tools/delete_all.h
-fcf5cd1f87696dde2761ad49492cdcaf *inst/include/tools/get_all_second.h
-dc04db24bc04b2addaa5341d41835bae *inst/include/tools/get_single_class.h
-cd00f5b80e3fe023bb254a819e17a572 *inst/include/tools/hash.h
-7f67f60607388622ccfe99a026cd133b *inst/include/tools/match.h
-c07891ac9fbee9384a2858692f7979b8 *inst/include/tools/pointer_vector.h
-926c52817eb1b5abb52ca4ec06d55e5b *inst/include/tools/tools.h
-6992a53b8dff0d7e36e9c320c2a35c4e *inst/include/tools/wrap_subset.h
-8ce23af5cab4ff7585c25dc891ce1c4f *man/add_rownames.Rd
-411a944402f7518f5ad82f4a78922932 *man/all_equal.Rd
-522ea853c7802c4e2edd4105d8650692 *man/arrange.Rd
-5e359a50d4fe0095f051b9b4576a57a0 *man/as.table.tbl_cube.Rd
-e35227866e00c51147ac32be29e9d98b *man/as.tbl_cube.Rd
-d53320d7591168ff1bc53567b2ae2006 *man/auto_copy.Rd
-2b06b22588e187a1e9fdf1a5448675df *man/backend_db.Rd
-9a6c2a2c24e7ac61e326f51eea38ad43 *man/backend_sql.Rd
-3fa21c365fb4c540a8edcbc6d115594b *man/backend_src.Rd
-1f1650963ca94ed4af69556249d20a15 *man/bench_compare.Rd
-399b8c4d2b53db9a4f7822503d61dc5a *man/between.Rd
-8cafa2dde17a45486d68fef896998c20 *man/bind.Rd
-ec78dd05dbe5fe1d622f5100f1f46db7 *man/build_sql.Rd
-6a0feb103f055d01a55293a59b3379df *man/case_when.Rd
-f84efdaa1b4145d1cc13b26c94022e82 *man/coalesce.Rd
-ae941fed91e8fac54ecbcfc29e51fcb3 *man/common_by.Rd
-cb923e929dd985020ae7f6d269845b8b *man/compute.Rd
-7c1947b28c4b02c791df99b7537892f2 *man/copy_to.Rd
-ee0cf87661cd201d31a722c993d23fb8 *man/copy_to.src_sql.Rd
-b1e131564b1b7b8b663306cc327aca9b *man/cumall.Rd
-2d2d6022b805189b99bfcb982f98b94d *man/desc.Rd
-a635b3e916fd98c1bf47a9db22394180 *man/dim_desc.Rd
-c3c6123d24af4fdc6c70eed0fe1e5f50 *man/distinct.Rd
-f7d19f1508f8dcd4a4ab8e892b509b04 *man/do.Rd
-01509c2db5d00758955b4843a217422a *man/dplyr.Rd
-96ba4a8dc01eb2308b8f02bcdcd1670e *man/explain.Rd
-43955cbabeb5df463622bd09a0292f42 *man/failwith.Rd
-54f27146ea57c4e16c0a9100cee3f94e *man/filter.Rd
-63015fa4ff3063bb22d0f9c93d404283 *man/funs.Rd
-107956bba96c595a3259bfcd1e37a473 *man/group_by.Rd
-34e23cc76ad3f01981e496d45104a5ee *man/group_by_prepare.Rd
-11af74c489ea074f1e3e93e1dfb1c110 *man/group_indices.Rd
-5b7a01661fcf05cb5967c08c84c5a270 *man/group_size.Rd
-c861146d2d6eae1446b3758747bf0178 *man/grouped_df.Rd
-d0df081033ad5051b49975acb0f4e97b *man/groups.Rd
-51afda733d7ca92d1af77880ab67ea7a *man/id.Rd
-1a10120e2822de6613fe14e96369ffeb *man/if_else.Rd
-c8804c8b460df3e802dd547f8b112624 *man/join.Rd
-458a759e3e09a91530aaf1f57a6b2902 *man/join.tbl_df.Rd
-3b1f879289ae2a021506527e79b12c6e *man/join.tbl_sql.Rd
-afba9084bfd2320acecf444df60144e8 *man/lahman.Rd
-e01e0cd73cad74eceffc94cd05c9249c *man/lazy_ops.Rd
-3256cbf7d15090600dc2d4bf41be4e51 *man/lead-lag.Rd
-45fa52581cd0ba41f1966abf6d92f9f7 *man/location.Rd
-e098697eab31b053cfcd5fc28eed10b4 *man/make_tbl.Rd
-e647a759b936e9c7c74a0155c6bd310f *man/mutate.Rd
-8791976d8c0c70a05e6e774a1d91a301 *man/n.Rd
-293e7a2f410a8ee73b6d76c0ceb8e63d *man/n_distinct.Rd
-7af53334fee19ffc30610abacea0ec17 *man/na_if.Rd
-e4346e4f107e310d08e85895e43c9701 *man/named_commas.Rd
-b1f0b6094157bf0890d5896accf1bd62 *man/nasa.Rd
-1b1b71717170187e9eedbcde21dce728 *man/near.Rd
-5ce5022d013efb91b9d9d6dfda3fe568 *man/nth.Rd
-e5cfda2a843dfd8bb95266569e2ba701 *man/nycflights13.Rd
-f81043960aa361e3bfcf6f68959d5352 *man/order_by.Rd
-07922ac698891fe4dd6a65df6eaf8182 *man/partial_eval.Rd
-c315eeece357d4f85bc787085960b979 *man/progress_estimated.Rd
-4f3b40d58ffe8f601b274bccec429d36 *man/query.Rd
-5b034a773eab0702b53409f3f3824647 *man/ranking.Rd
-5e6785464a71eda6e417fe75b5685198 *man/recode.Rd
-08699fb2972a100035657d7e4ffb60dd *man/reexports.Rd
-90fe7db484a27edbf68a3c7b65064f2b *man/rowwise.Rd
-70c3360d42861862646d4e1d430705e3 *man/same_src.Rd
-5c9c1225d53bcee805f4d914181a33d9 *man/sample.Rd
-9be84de229ee9f015c961b7751d9e060 *man/select.Rd
-e33448221d6a4547ea669e09cf9086e4 *man/select_helpers.Rd
-c35eac86a58998b35f21938cf41b4f58 *man/select_if.Rd
-f759a477554864b0942e23ec4664f0d8 *man/select_vars.Rd
-ecc569856bfdef2694e8e0484f26e5aa *man/setops.Rd
-fda9deefee4674214c8592093086a2d6 *man/slice.Rd
-728d33cb12eb63d74e2cf905e0da00f8 *man/sql.Rd
-3c7f82a9ba5c7b84fc1609768eeef72a *man/sql_build.Rd
-8e2529e18fcbfe308dc3b802b305fb98 *man/sql_quote.Rd
-29491318cbbc8d157e728daa137dd34c *man/sql_variant.Rd
-5453d02ccfd001adbc91797412bbddad *man/src-test.Rd
-98a59d8afb4e3ed90092e4c695406482 *man/src.Rd
-d1afba5a6b75bd8cc0bd11b5b111b474 *man/src_local.Rd
-bb9610f5a98b2f179bebd014e9651af0 *man/src_memdb.Rd
-fc16d61c5946f64de64cca1e632bbaa2 *man/src_mysql.Rd
-6eca9264744e05d8a9fbe4ef8a271381 *man/src_postgres.Rd
-6c36f1375fb7cc796fe83aa7e772c528 *man/src_sql.Rd
-8a9f59ac64215bd62c2917f7a8e1961b *man/src_sqlite.Rd
-d0aeef320441b07c5501a865bf0588b2 *man/src_tbls.Rd
-f6eab7603c9adf31453c78205f102d24 *man/summarise.Rd
-292ffa87d9f2d79bc078dfefca6fd653 *man/summarise_all.Rd
-7cb0e6e9f7b628975dc5e47c8074dc94 *man/summarise_each.Rd
-3307d9748852e6a18f74ecff6a2e884d *man/tally.Rd
-6ec3f1e837d28b715a00ceb211224845 *man/tbl.Rd
-3b2233036393cf1dd9d7ec6c4bd9d2bc *man/tbl_cube.Rd
-ee82a54f7a0c642cbc86918dbe18dc22 *man/tbl_df.Rd
-06d09e52cc51affc8776773b16fd95ad *man/tbl_sql.Rd
-a1c0fa086e53fe3b36e5eeccb4d651cd *man/tbl_vars.Rd
-9ff99d1f4d4f8c3acaa4e7fa5de94f03 *man/testing.Rd
-3aa22337b9d70d0fa4f6fe7ebb9cbdaa *man/top_n.Rd
-092790c72dd83ae3bd631fed3332a15b *man/translate_sql.Rd
-49acaf15923fd32f8983c1654f551dac *man/vars.Rd
-9610f909936484b961ad488a02a8cbb1 *man/with_order.Rd
-c1665f111aeba602a285f284ea076aeb *src/Makevars
-dde90f02a4ecf1906e681d2ad580bccf *src/Makevars.win
-7782fb62a63dfe6cebe57e5905ae63fa *src/RcppExports.cpp
-73a73076fedaf38374f9bb9e2401fc53 *src/address.cpp
-470567d3565d893f98801eb264b8e5b6 *src/api.cpp
-c1f357058325b43337fe471517cee305 *src/arrange.cpp
-4dae1507dbbc5ea7499f1d9722683b0b *src/between.cpp
-e83c2fcb01e29a3b6bf8d7e59326adab *src/bind.cpp
-7a3cce40cf5166b3f54ce554f47a55a1 *src/combine_variables.cpp
-19e59e16e70fc79f10ec037b1f6af281 *src/distinct.cpp
-55dc3776d28f4bd3bf66ce5ba2518a34 *src/dplyr.cpp
-64db7530d2bbc17b33cf216d1e0a824e *src/filter.cpp
-bae3b5ede136f7bf39237f1a8c5eb623 *src/group_indices.cpp
-ca4abe99a03d4cb988ad2d5506f8b073 *src/init.cpp
-b855ae6ffd9b98c86cbf6aaab3ea4372 *src/join.cpp
-fa7cf0929b7388e31793d9db7d9c9259 *src/nth.cpp
-2d6f398bfde0781c5e0973a98681332f *src/select.cpp
-0461f04e9b56c3d3057f6371a33445f6 *src/strings_addresses.cpp
-2249d859a0699e933e69455881d6a6ed *src/summarise.cpp
-7d887eaee44da3d6a987b22944dc562d *src/test.cpp
-692555bfed454b125d3578887ee4eebb *src/window.cpp
+7347cd43b3ea6d8732e7d8c08709f230 *inst/include/dplyr/dplyr.h
+fcbb0b106f5368eb978b20eb62b6a242 *inst/include/dplyr/get_column.h
+3d9793d3f4aed92f771e592d1722315e *inst/include/dplyr/join_match.h
+5743df751edac69e36be84d3ef7dd00e *inst/include/dplyr/main.h
+5145aa0dc0f71460943f3b1b66be0f58 *inst/include/dplyr/registration.h
+90201ca5a1d0b13233666bbb97507d74 *inst/include/dplyr/subset_visitor.h
+23c164352e99c995b73e93566cb0d83c *inst/include/dplyr/subset_visitor_impl.h
+d821ac459208bed5942ba5f94905c550 *inst/include/dplyr/tbl_cpp.h
+8c0b4ad334d9be1f0b5ce771d07dba65 *inst/include/dplyr/train.h
+4a5c1902bca4e2117b136440f257d957 *inst/include/dplyr/vector_class.h
+0800be9d43f8ec965dbd3224c8fd4012 *inst/include/dplyr/visitor.h
+46acf63f305056373c7c7c668b77a268 *inst/include/dplyr/visitor_impl.h
+1e2906bf49719fefa96aca7f79740df2 *inst/include/dplyr/visitor_set/VisitorEqualPredicate.h
+9cf1e53f07ee689dbdfdceabcffb73d7 *inst/include/dplyr/visitor_set/VisitorHash.h
+ffa81819c00e48af72e4283da6335626 *inst/include/dplyr/visitor_set/VisitorSetEqual.h
+bc8aba742b74033a7a0198fe0a6bb1db *inst/include/dplyr/visitor_set/VisitorSetEqualPredicate.h
+c15359ece2c8557e0cfe62dbb9b5496e *inst/include/dplyr/visitor_set/VisitorSetGreater.h
+3994cefa41bb598b1088a63ff3972df8 *inst/include/dplyr/visitor_set/VisitorSetHash.h
+4f41a7cb1a7661898fa6bb9a6e068a94 *inst/include/dplyr/visitor_set/VisitorSetHasher.h
+1d9c35094d67582a8798a15100bae704 *inst/include/dplyr/visitor_set/VisitorSetIndexMap.h
+783a06ac112d4ed1d80d909a1bfef7d7 *inst/include/dplyr/visitor_set/VisitorSetIndexSet.h
+f6b089ffb4ac972e41327e6cf4a7a631 *inst/include/dplyr/visitor_set/VisitorSetLess.h
+602bd49b2fb731f2c7cecc7bca5de7c1 *inst/include/dplyr/visitor_set/VisitorSetMixin.h
+38cdeded34ff55544dec4fb5accd4f76 *inst/include/dplyr/visitor_set/visitor_set.h
+bfa26c7b5053752acd1c656093912684 *inst/include/dplyr/white_list.h
+d97a391a75cd0135078f0f23bc7530a1 *inst/include/dplyr/workarounds.h
+0f10f6bdf53eb88da5c34f5c38092485 *inst/include/dplyr/workarounds/static_assert.h
+acbe82c8b4bffbbdc46a7dcf396f25d5 *inst/include/dplyr/workarounds/xlen.h
+b07e3305423b489f39135f9a953f7e28 *inst/include/dplyr_RcppExports.h
+24c588d16003d98d4091a47052041d06 *inst/include/dplyr_types.h
+82b929bfe98b923eaf0afa2dd619cd02 *inst/include/solaris/solaris.h
+c37a2ff952d99b600fceb515f0b870f2 *inst/include/tools/Call.h
+fbc40f5be8153aad3a774ca447b1612c *inst/include/tools/Quosure.h
+fadd0a6e44d96114fa3c18ee2a0c6238 *inst/include/tools/ShrinkableVector.h
+04f555e8fd6d555cd976a0432c3c02e9 *inst/include/tools/SlicingIndex.h
+3c9f431a82ec52c4d8eaa5fd22eb60e9 *inst/include/tools/SymbolMap.h
+40e4225dde275a5e4e7123f53676ac1d *inst/include/tools/SymbolString.h
+56715125ea66a0c0681039f8bfc31e61 *inst/include/tools/SymbolVector.h
+fe5dc1e4c404abcc064c2aa630c32633 *inst/include/tools/all_na.h
+a5485d3266ffead15544b57963214e6d *inst/include/tools/collapse.h
+3925ac3bb26957c50a2f00b7def28383 *inst/include/tools/debug.h
+987b722e33642d421589272a2755a10f *inst/include/tools/encoding.h
+faf28ee20fd60bd404e1f170f0cd11a1 *inst/include/tools/hash.h
+10e5797b25ff08bbc23e59bcf731d8e3 *inst/include/tools/match.h
+5c156c15b8836635b0a1a555cc8b383f *inst/include/tools/pointer_vector.h
+e138d5a777a12072994f5de444ebee94 *inst/include/tools/rlang-export.h
+7ee868151a2c60de7763603056695fe4 *inst/include/tools/scalar_type.h
+6700d3f80e57fb79110a9f91d896c172 *inst/include/tools/tools.h
+0e7bae069151ecfb3d7e371791c1b6fe *inst/include/tools/utils.h
+906acd5394902f4542ef7f1fd5103f99 *inst/include/tools/wrap_subset.h
+a225fd929ff032a7d0de024080c53317 *man/add_rownames.Rd
+8ebc5399b6e820089bb6cda7578dc0c2 *man/all_equal.Rd
+af4dd958ae305bc8e00b459e72bdd6ae *man/all_vars.Rd
+c39b0f8750bdc36d9cf5f903f0f60a59 *man/arrange.Rd
+d78681a5316be745ada481e026f28a60 *man/arrange_all.Rd
+d231302df4f1df45262ba47bf8ad4a0a *man/as.table.tbl_cube.Rd
+4ef9ac02e06e8231e47bc0ba603f047c *man/as.tbl_cube.Rd
+137be2eff7b7ad639f186670f6d93a00 *man/auto_copy.Rd
+e837390946659cad685c6de672f5d430 *man/backend_dbplyr.Rd
+e81531ed16876cb1bcfc57b89f4e4673 *man/band_members.Rd
+f75e5017449d740c6d1064afe0182250 *man/bench_compare.Rd
+65ccf23d43309ef888abf18a799e8623 *man/between.Rd
+a830fd426f6514d7ba9d47c2dbcef9d3 *man/bind.Rd
+5a6a67d3511423b285fbfe97525f6c4c *man/case_when.Rd
+61714316fecba42a2e0365172c585dbd *man/check_dbplyr.Rd
+23eb23da64f29f623dad9899b90b781d *man/coalesce.Rd
+a9e659ed5ca31b048ac71cb9e66b383e *man/common_by.Rd
+8b57c8dc55db0515b925f21809335928 *man/compute.Rd
+cbe3e88cf9b11546a7e0f3134ddf477c *man/copy_to.Rd
+e08d1d0ae0996911495a866663ca4642 *man/cumall.Rd
+f0c7978518fbd44832836a74288b6bca *man/desc.Rd
+88537f8e714f9460fbce7cd0952dd552 *man/dim_desc.Rd
+d8fb1155a13fa78cb2ae4c01069e962d *man/distinct.Rd
+649c528950338e9507e9cd3629a90032 *man/do.Rd
+db694cbe38c376bb4655f3abb8c52f78 *man/dplyr-package.Rd
+252cd38bb67d05ca39c7099f34e297bd *man/dr_dplyr.Rd
+9e046c5f3b2e55fd1f238d08bbe0ae99 *man/explain.Rd
+9c4243b2d1393ce697564ff1e2f8575a *man/failwith.Rd
+f624141eaf9866aff28d18cce3eefabf *man/figures/logo.png
+f36acf8fa80695728bd515aff51b7a2e *man/filter.Rd
+a55cc63b7721858b8d6d1562f27bacda *man/filter_all.Rd
+05fb5768090e2c050df651f35257ee55 *man/funs.Rd
+fab411c643cc62ee7ba17a03eab9cc56 *man/group_by.Rd
+b9ab921fea3f0f38827852c7e81a057b *man/group_by_all.Rd
+1f541b07bab33ec6158e734fbcc28610 *man/group_by_prepare.Rd
+d9b38100e7ff3cf56ac293fbf1a2127c *man/group_indices.Rd
+14a4975898f60b4ad82c2a67d7bceac3 *man/group_size.Rd
+cc197eff27117a37ddcb3ff12666f4cb *man/grouped_df.Rd
+e2fa16e0b5c4dd20d0f26b42a0eff0d6 *man/groups.Rd
+dfe21ce2272cf6af5ec2d6e84dda7434 *man/id.Rd
+c2a02625ef1ecb6006161f2b851a90c8 *man/ident.Rd
+ac526ee481cb8465b06783bacaa9d5ea *man/if_else.Rd
+fed1d0a4957c37ae234ceb655095f717 *man/init_logging.Rd
+495ecd956475100e1f94e04c0445502b *man/join.Rd
+e7df58f5b92a773fff8deae3b2b3e59d *man/join.tbl_df.Rd
+1fd64a5056b2e7e63cad54a5baef1a0f *man/lead-lag.Rd
+f2baa420397b241a7ce5524389d30165 *man/location.Rd
+accd69a53d9368ad2b36f225c9f99c44 *man/make_tbl.Rd
+b3a252bfe2f7a79e3b223e1bf36d0dfa *man/mutate.Rd
+90e12533ec0313b5c8c188bcba95c4a4 *man/n.Rd
+cef0c5bd879094aee163f5dd0bddf95f *man/n_distinct.Rd
+69a4bf99f26ca5d936f9c658c6b366c4 *man/na_if.Rd
+f611dd22872f69b715412a9aeebb58fa *man/nasa.Rd
+0c99aebdb1662bb9e9fc97334212cc4d *man/near.Rd
+4d3bedee0f9023f032ef29bb1123e66e *man/nth.Rd
+535b4dbee16f84764a5a889687d83159 *man/order_by.Rd
+2366ead0f1e2d68dc23d5d7698509eca *man/progress_estimated.Rd
+44e78e319da9183fe0c5d8bc0738600d *man/pull.Rd
+05c5ef7f142860d24dcccdd200f674d9 *man/ranking.Rd
+533a70b389f89ac19c3382ba4cd17b0d *man/recode.Rd
+2f19c484b2a1ec3e5eb97f15ea3c987e *man/reexports.Rd
+7c84ec63cc9a26d8919f8cc8a86cabdc *man/rowwise.Rd
+f7b4ab90ebcaec811366c956b4d1401a *man/same_src.Rd
+bf6eeb098029398bcbd1f766213c1075 *man/sample.Rd
+34b8debd04bd7237f2e30caf449dc983 *man/scoped.Rd
+efa26c78daefc6ee18871cd068bb5953 *man/se-deprecated.Rd
+5b15b78fb1a3116860edbb137d912e8c *man/select.Rd
+ca47f42b76b9d11fee12192ffe7e596b *man/select_all.Rd
+986ca423177bcef9c746456e520163f8 *man/select_helpers.Rd
+c3d3a06661e57d74b4947fea6cb586da *man/select_var.Rd
+d9f84dfcff684bfa18c1ba442501c8f1 *man/select_vars.Rd
+41151fe390d36a8b8701179eeae71bef *man/setops.Rd
+c8437b69a4dda546d2773b6b8d1703e4 *man/slice.Rd
+58452b35baf6118d7567b5cc0ce5aa9d *man/sql.Rd
+cd2f983efae88f0f38d5556f0dceb39d *man/src.Rd
+504c9da62e2f9065c8afa9805c60119e *man/src_dbi.Rd
+9c7f779fcfff545a30993f402e3725ac *man/src_local.Rd
+d989d2d5844137626d8aef870f08517a *man/src_tbls.Rd
+c52615a2da3cdf11e122bf096a7333ac *man/starwars.Rd
+30ac23855f0884a0e1b2abd179659512 *man/storms.Rd
+0cd173d7b18f3406123977f36639892b *man/summarise.Rd
+33af585518d5afd0c4c3556f2a63767c *man/summarise_all.Rd
+3d462c19ed35905604548989ef663ffb *man/summarise_each.Rd
+f961f0b85a9f7645d8c6e454f2df8251 *man/tally.Rd
+4e70bd464f7c1dfa062b8cd00995742d *man/tbl.Rd
+5318e0f8a9dfe58ed56ec709eb61a103 *man/tbl_cube.Rd
+88cf6cccf58b2795bee20b292bacbd33 *man/tbl_df.Rd
+feef54e529c985897f7cc1e50c99dd31 *man/tbl_vars.Rd
+92bb398f9ecfc3638c16f23ed68633f6 *man/top_n.Rd
+7fac9e0813be539248b4fc7ccb242f80 *man/vars.Rd
+0f716a00c40a985424f71afd4d758a80 *man/with_order.Rd
+c1edeace16ce7538551c133b2ecab8b6 *src/Makevars
+557d367d9b1adb5e426c68161ff8d243 *src/Makevars.win
+805c35a45f53ed93aaf26fe662583608 *src/RcppExports.cpp
+37c770f963e3822189e4becaa5824bf3 *src/address.cpp
+cdb523b1b441d31653f60aa0d0131321 *src/api.cpp
+9a4a578240e819c042c7a5741a9c0836 *src/arrange.cpp
+ffface0cba75e7f9bdb5e4a12b5d53c7 *src/between.cpp
+ab7ec4c805d2e5e553a6c7fa0669c018 *src/bind.cpp
+7e60a66aa85a4fe710883bc74db7edaf *src/combine_variables.cpp
+8b38746b806dfb6b0bd11db3ad00391a *src/distinct.cpp
+4ac8ba8d07f6524095b9be5e05ec1426 *src/encoding.cpp
+1fe234ff15f121c085e21be4a96459e9 *src/filter.cpp
+fa25249ff6a4d9d611cebe539d8a626e *src/group_by.cpp
+4f22a9a58dec466c7ed447e9ef080c0a *src/group_indices.cpp
+6513ab45acd9d88fb8b91af06d9aff30 *src/hybrid.cpp
+a4360a6e99f4d7a08ed07b0dc7ed21b9 *src/hybrid_count.cpp
+207c39c726ae3b20a633a628b0c6e18a *src/hybrid_debug.cpp
+9ae15aa6aa11cf18127b945119b3f2c8 *src/hybrid_in.cpp
+1d476aa586c9ac56bb1f48f52fdf87eb *src/hybrid_minmax.cpp
+ccb4aa2a3bb54075dfc223bf69efebe0 *src/hybrid_nth.cpp
+50673d04689ebc336ef2223925e5de25 *src/hybrid_offset.cpp
+894b53190fbdcfba9a8a31e4f812bade *src/hybrid_simple.cpp
+3a0cf69bfe8609a5d6371fc3d30bb44c *src/hybrid_window.cpp
+941719618362655cfc97318a5558eb05 *src/init.cpp
+13e76eb1c46254e06568d262e4bb76e7 *src/join.cpp
+331cc14af6d275716c6d1f0d1e7dac91 *src/join_exports.cpp
+f29a596f0bdcb23ef4ea72e988908787 *src/mutate.cpp
+3d1f112b1c24d6ccb497bf7688319277 *src/pch.h
+601b3dc608aee72d232bd41d91552ec0 *src/rlang-export.c
+9a5351152b2cafa167ce65394d8485d7 *src/select.cpp
+c1e5f099d0d86504ed2849fb84945218 *src/set.cpp
+13b365367b42e322af5f36e659d446a2 *src/slice.cpp
+ef5de53330c734ed75a28ddaad109d53 *src/summarise.cpp
+765f8574e0425a01008b36e86577d6b0 *src/test.cpp
+4d71eb73382dc2ed05a4abbc86a379a3 *src/utils.cpp
+ae75f69095361796a8b1f1480e6bafbf *src/window.cpp
60c24a9c9c03f728e0d81d86fa6ca4d0 *tests/testthat.R
-37a29834fe43519f14036fa4ed93f4bd *tests/testthat/helper-output.R
-3780a1935c77976294784768425d34d8 *tests/testthat/helper-src.R
-88d09a18f3524c92fe1066d3cb2126e1 *tests/testthat/output/iris--70.txt
-5608ff1de0a20504ee416dc6ba17cca1 *tests/testthat/output/iris-3-5.txt
-4d0f0f48b598faaa237a890b95095e9f *tests/testthat/output/iris-5-30.txt
-eb9dd6e4b4cdf652d0e33d12fc1cb56a *tests/testthat/output/iris-head-30-80.txt
-14edb4c32a58373d3762ea578884b59f *tests/testthat/output/mtcars-8-30.txt
-14be90ffb96bcb79c4f71d2022573c66 *tests/testthat/test-arrange.r
-2c430e661b2e65e9579927154d8cc917 *tests/testthat/test-as-data-frame.R
-542f9b3d925ba52c93bf8fb888d6c7d1 *tests/testthat/test-binds.R
-93f89555930db3be2c1b68363ab85523 *tests/testthat/test-case-when.R
-2af574c56554f84975584f778d7dea67 *tests/testthat/test-coalesce.R
-acbdef2c1b4e4f16f4d62bafbb1e501d *tests/testthat/test-colwise.R
-983839f010c8692a3e3eb0f3bccb57d8 *tests/testthat/test-combine.R
-20eaed6f954eaa3208443701dc6451d0 *tests/testthat/test-compute.R
+e9248c4f9645bef5724077c1960b1446 *tests/testthat/helper-astyle.R
+121df99ae015bed4f0ac880942f96e55 *tests/testthat/helper-combine.R
+18ee78e4be8490b9f052ff557e490b2e *tests/testthat/helper-encoding.R
+75a00613129fa9eb76608bdda725f4f9 *tests/testthat/helper-groups.R
+973b72318f6f908f26492cca68356f77 *tests/testthat/helper-hybrid.R
+5c40bc3557d7e68c3f963b4f428f5c20 *tests/testthat/helper-torture.R
+e9d8127c14ea4a64ffcbf9230229a8a9 *tests/testthat/test-DBI.R
+964dd84f4a546557d44e3d8ff6edd7df *tests/testthat/test-arrange.r
+49113f7f672913fe11e374d7f5c3c544 *tests/testthat/test-astyle.R
+5433288875c5cedb640f9aad2f48550c *tests/testthat/test-between.R
+3316a317dc0b57b58695b68535e3dda9 *tests/testthat/test-binds.R
+815b47c2e3137fddf8085c8aab5bcad5 *tests/testthat/test-case-when.R
+84b7a73b11d5900d181d8c9e53abf837 *tests/testthat/test-coalesce.R
+a5f96eb5b0e704d5fe408802e2a589d4 *tests/testthat/test-colwise-arrange.R
+e1860ea3cc41780b8e32c884383b51db *tests/testthat/test-colwise-filter.R
+8a69525168699b97c0d02599da073707 *tests/testthat/test-colwise-group-by.R
+85641404de7e30f993acf36c28ad9d3e *tests/testthat/test-colwise-mutate.R
+2fb8be6a8e06c682dd42d483c5f81062 *tests/testthat/test-colwise-select.R
+819ddbe79fc237fd2d74407b03bc8c8a *tests/testthat/test-combine.R
+5d39e8d49dbe9b34ec924632e529d3a0 *tests/testthat/test-copy_to.R
39988efc666e80566c47eb579443096d *tests/testthat/test-copying.R
-2aea1417aa3f0b306eb11bc32b0b714a *tests/testthat/test-count.r
-45c2feacb226ad0604300fb8831e440d *tests/testthat/test-data_frame.R
-844e2e1e4da562b07015eaf758597e21 *tests/testthat/test-distinct.R
-ec800e4eb31fd38aeeb932e223b50558 *tests/testthat/test-do.R
-3cea2a8daf89335c4e07e3ab2495030b *tests/testthat/test-equality.r
-304ad26a928e586d1b7de22887978b04 *tests/testthat/test-equiv-manip.r
-42ae3ae1ab24e37078ad8a243ef4783a *tests/testthat/test-filter-windowed.R
-e96492d2f7789a76ae4a2c7c84a433ba *tests/testthat/test-filter.r
-d15c83734d3b28ce7ebf449eb43288c5 *tests/testthat/test-group-by.r
-70de41db1766e1bf2125ae18a3604210 *tests/testthat/test-group-indices.R
-241b7b917e61b322d68d37a4d4f032ab *tests/testthat/test-group-size.R
-dcf8c54ad7fd6e93a9e71f3e38951faf *tests/testthat/test-if-else.R
-337577ba6a8c296e44dfb23040b0d0fb *tests/testthat/test-internals.r
-cd62b66dd39aab4960e55f59638479b2 *tests/testthat/test-joins.r
-12df6121c9004a42a27ca3c2f0bafd8c *tests/testthat/test-lazy-ops.R
-67f701fee3f4db9c076c93c731a0d5ba *tests/testthat/test-lead-lag.R
-21c47293a1f13c29b4414648ccd15bba *tests/testthat/test-mutate-windowed.R
-6e516015ddd462ea76b7da771208afa4 *tests/testthat/test-mutate.r
-453665e29a54e02cbe6d81a675117720 *tests/testthat/test-na-if.R
+164e1bd555e983b9c20d7ac6841d800f *tests/testthat/test-count-tally.r
+6ed341fde2d49835a4c45d3c6396a4d5 *tests/testthat/test-data_frame.R
+7a8dc5c7aa7d64e221cde29c5629a1f7 *tests/testthat/test-distinct.R
+8d9b6bc55096d402cd5d4a66b7c1a57e *tests/testthat/test-do.R
+b78a76dcabe8d7cad8c398d28c2ca64c *tests/testthat/test-equality.r
+aeaf9492caa11242adf833a2d169f463 *tests/testthat/test-filter.r
+b8d8b511d6dcb0de88b6c3a6287d98b2 *tests/testthat/test-funs-predicates.R
+3959e8c0efa25ed3ee7842b7b7e97910 *tests/testthat/test-funs.R
+1711b04ed60fe2b9fb1a92253371c148 *tests/testthat/test-group-by.r
+6ea56bf978c055050c2d8dba2c79519e *tests/testthat/test-group-indices.R
+bd21aafc0d45b03265d23f5f34b39f8f *tests/testthat/test-group-size.R
+e6ba9b650d38ca24558cf1163453ab13 *tests/testthat/test-hybrid-traverse.R
+08c35372de213c444cbb3e91fdfe3680 *tests/testthat/test-hybrid.R
+b0ed2c4ba15717aa1a196f35a910fd0b *tests/testthat/test-if-else.R
+9083be60b404381c0aa9d7a88edd195b *tests/testthat/test-internals.r
+15fb29def1b7fcd328e4b06924f89724 *tests/testthat/test-joins.r
+2377b2e5e7d9cb8c25be2a17081e0774 *tests/testthat/test-lazyeval-compat.R
+b465e7fdb03a31ec176f76e586b95189 *tests/testthat/test-lead-lag.R
+ef85acb333eb45533b395efdbc5b81be *tests/testthat/test-mutate-windowed.R
+70e3a671adf09e2b0b423632fe24bb68 *tests/testthat/test-mutate.r
+048f33fdb80ba215845bcb54acfe4079 *tests/testthat/test-n_distinct.R
+7e5620dc7a74958744f339d8df3a485c *tests/testthat/test-na-if.R
11f0ac527e16acc38b76e69f81cfa3a6 *tests/testthat/test-near.R
-8345bbbe10d39fa9d6576cb423b33946 *tests/testthat/test-nth-value.R
-b97b8b7c03e243e25dc2fad3e36664b0 *tests/testthat/test-output.R
-8eb398aa7588f7ef8e5107dcac27901a *tests/testthat/test-recode.R
-050d27adc4f6dc7b8d93fad89c92d352 *tests/testthat/test-sample.R
-8296c81632ee2698195e29cf17b944fe *tests/testthat/test-select-helpers.R
-7d48ba306bfa575208bfd5d7a3f24179 *tests/testthat/test-select.r
-7ba39f5e266566f95969de554e1ad8e9 *tests/testthat/test-sets.R
-2c7e8e4e10f6098dea2886945554e585 *tests/testthat/test-slice.r
-58f93f0decccb857c5a74002b59a5a55 *tests/testthat/test-sql-build.R
-9a61c11420ecc6cc2f21b4f9eb844a65 *tests/testthat/test-sql-escape.r
-30e45d9c8d04f1b86159791fdc26e9b9 *tests/testthat/test-sql-joins.R
-ccb7740375dc0dce945c8520205bc381 *tests/testthat/test-sql-render.R
-80541902e568dd40c85343b84081a0e1 *tests/testthat/test-sql-translation.r
-4070023b35d45a47fbc6c73b668425f7 *tests/testthat/test-summarise.r
-636bf18127933c99fbdf89ac3fee4a45 *tests/testthat/test-tally.R
-a2311277247e29e121e63b756459c799 *tests/testthat/test-tbl-cube.R
-9e515933117fd4fd01f19c5a5e68a5e5 *tests/testthat/test-tbl-sql.r
-d1dc1ba1f05cf1003fca70943d0e1c17 *tests/testthat/test-top-n.R
-bda7de675fec59a2da9977ae56b64a02 *tests/testthat/test-union-all.R
-27120b98d45aa8fec61d11202aedca4c *tests/testthat/test-window.R
-6a4c95d32650caa285df9805c0d95245 *tests/testthat/utf-8.R
-a7e4e8edbc2da0d11146f682f90890fa *vignettes/data_frames.Rmd
-067a9145d801d3684926b6d7072ff700 *vignettes/databases.Rmd
-0e3c1a8694d4691c565b7bd8ba0640d8 *vignettes/disabled/benchmark-baseball.Rmd
-cb972bd572ea9281da84ecebf1c2cb9a *vignettes/hybrid-evaluation.Rmd
-28297cb14323db649c593799836ea306 *vignettes/introduction.Rmd
-cf624089b182d725bddd2f9bb0499087 *vignettes/new-sql-backend.Rmd
-7a0f22ccbb11e6ac921f0e5bd9bb5e9a *vignettes/notes/mysql-setup.Rmd
-9f10920a2e83db02b2a154aed9b7e0d7 *vignettes/notes/postgres-setup.Rmd
-263ae40ea899d02ebb5c68dede18c486 *vignettes/notes/vagrant-setup.Rmd
-5da8c99efc46ce7a22c208d07c6eabb3 *vignettes/nse.Rmd
-40720863442bd95e35b9163bb92ad00c *vignettes/two-table.Rmd
-e9a5e0fadbdb427a3f677e77fbcfe8a7 *vignettes/window-functions.Rmd
-83cdde894e0c44ffda5a9dbae3c80092 *vignettes/windows.graffle
-2cc473a6bd316193615aee5045fcc835 *vignettes/windows.png
+ec98e22acefd5e4b62d0149c45224df8 *tests/testthat/test-nth-value.R
+695db6bd6c8de70a92802a23e59030ab *tests/testthat/test-overscope.R
+b4cee9ceff927660d5b6eba5a04a5df7 *tests/testthat/test-pull.R
+4cbceb6bef219dccbdf27f4ef9002d7a *tests/testthat/test-rank.R
+3a3c2b6b0eae4bbf5cf0ee101523d0dc *tests/testthat/test-rbind.R
+33386a4fb6ce23987ac6c8b886be2d4b *tests/testthat/test-recode.R
+00004a802030a18b41d6cc0ebf1ad204 *tests/testthat/test-sample.R
+ef4fd61961117f15d36459620a160d53 *tests/testthat/test-select-helpers.R
+e309a886b815146eb136da15c9c4382b *tests/testthat/test-select.r
+b9dcc058be8446fbc97401a9898f8286 *tests/testthat/test-sets.R
+5a2b659c5e745d8b547d778e7914f1f8 *tests/testthat/test-slice.r
+abd126736dfa20b96d3d7e61231182ed *tests/testthat/test-summarise.r
+68d9af97a6e3003c798783e899e2f1b1 *tests/testthat/test-tbl-cube.R
+d7824b3609e1fdd9603588f3a102de01 *tests/testthat/test-tbl.R
+35e7016b701621b28adb725f49277551 *tests/testthat/test-top-n.R
+059ed6902eef67772ce4077422722ecc *tests/testthat/test-ts.R
+38c491aa6e241442fb2fbfbd45f6b833 *tests/testthat/test-underscore.R
+7dff4a2ecad3803f9f55eafbefc50679 *tests/testthat/test-union-all.R
+3d20e48e3df6c604bdb27cfe64732a81 *tests/testthat/test-utils.R
+1cf618d8eb65caf1c9f6e0e16b08792f *tests/testthat/test-window.R
+bb3be50f7600a3618d609b259e1a900c *tests/testthat/utf-8.R
+7258317bbf67e7807cd9c4246e59b89f *vignettes/compatibility.Rmd
+e3ba5b6ccdcd5ca3dbcec1e438053720 *vignettes/data_frames.html
+1e84672ae71e2186bc531a0b620aa09d *vignettes/databases.html
+f805eb5d31ac8be006c30e973cea77cc *vignettes/dplyr.Rmd
+1e9d4a90c59c600645a8be474020f47b *vignettes/hybrid-evaluation.html
+701025fb5fd2d309304870452c4f6d82 *vignettes/internals/hybrid-evaluation.Rmd
+62f994023cb169a3a16d24d2a3a28814 *vignettes/introduction.html
+7c41debe1a8f9cfe1d042f49ed66060c *vignettes/new-sql-backend.html
+1abfa9596e7559b1a21389fb54df1652 *vignettes/nse.html
+5da6cf8cd907f814a40a12641483c53c *vignettes/programming.Rmd
+2e918ccb3f55c5edb23e90a66c22ec0f *vignettes/two-table.Rmd
+d4563df394699b6eab7fe746a4d4170b *vignettes/window-functions.Rmd
diff --git a/NAMESPACE b/NAMESPACE
index 8d0f03d..eb2bf3d 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -5,18 +5,16 @@ S3method("[",grouped_df)
S3method(all.equal,tbl_df)
S3method(anti_join,data.frame)
S3method(anti_join,tbl_df)
-S3method(anti_join,tbl_lazy)
+S3method(arrange,data.frame)
+S3method(arrange,default)
+S3method(arrange,grouped_df)
+S3method(arrange,tbl_df)
S3method(arrange_,data.frame)
S3method(arrange_,tbl_df)
-S3method(arrange_,tbl_lazy)
S3method(as.data.frame,grouped_df)
S3method(as.data.frame,rowwise_df)
S3method(as.data.frame,tbl_cube)
S3method(as.data.frame,tbl_df)
-S3method(as.data.frame,tbl_sql)
-S3method(as.fun_list,"function")
-S3method(as.fun_list,character)
-S3method(as.fun_list,fun_list)
S3method(as.table,tbl_cube)
S3method(as.tbl,data.frame)
S3method(as.tbl,tbl)
@@ -28,295 +26,188 @@ S3method(as_data_frame,grouped_df)
S3method(as_data_frame,tbl_cube)
S3method(auto_copy,tbl_cube)
S3method(auto_copy,tbl_df)
-S3method(auto_copy,tbl_sql)
-S3method(c,sql)
S3method(cbind,grouped_df)
S3method(collapse,data.frame)
-S3method(collapse,tbl_sql)
S3method(collect,data.frame)
-S3method(collect,tbl_sql)
+S3method(common_by,"NULL")
+S3method(common_by,character)
+S3method(common_by,default)
+S3method(common_by,list)
S3method(compute,data.frame)
-S3method(compute,tbl_sql)
+S3method(copy_to,DBIConnection)
S3method(copy_to,src_local)
-S3method(copy_to,src_sql)
-S3method(db_analyze,DBIConnection)
-S3method(db_analyze,MySQLConnection)
-S3method(db_begin,DBIConnection)
-S3method(db_begin,MySQLConnection)
-S3method(db_begin,PostgreSQLConnection)
-S3method(db_commit,DBIConnection)
-S3method(db_commit,MySQLConnection)
-S3method(db_create_index,DBIConnection)
-S3method(db_create_index,MySQLConnection)
-S3method(db_create_indexes,DBIConnection)
-S3method(db_create_table,DBIConnection)
-S3method(db_data_type,DBIConnection)
-S3method(db_data_type,MySQLConnection)
-S3method(db_drop_table,DBIConnection)
-S3method(db_explain,DBIConnection)
-S3method(db_explain,PostgreSQLConnection)
-S3method(db_has_table,DBIConnection)
-S3method(db_has_table,MySQLConnection)
-S3method(db_has_table,PostgreSQLConnection)
-S3method(db_insert_into,MySQLConnection)
-S3method(db_insert_into,PostgreSQLConnection)
-S3method(db_insert_into,SQLiteConnection)
-S3method(db_list_tables,DBIConnection)
-S3method(db_query_fields,DBIConnection)
-S3method(db_query_fields,DBITestConnection)
-S3method(db_query_fields,PostgreSQLConnection)
-S3method(db_query_rows,DBIConnection)
-S3method(db_rollback,DBIConnection)
-S3method(db_save_query,DBIConnection)
+S3method(default_missing,data.frame)
+S3method(default_missing,default)
S3method(dim,tbl_cube)
-S3method(dim,tbl_sql)
-S3method(dimnames,tbl_sql)
+S3method(distinct,data.frame)
+S3method(distinct,default)
+S3method(distinct,grouped_df)
+S3method(distinct,tbl_df)
S3method(distinct_,data.frame)
S3method(distinct_,grouped_df)
S3method(distinct_,tbl_df)
-S3method(distinct_,tbl_lazy)
+S3method(do,"NULL")
+S3method(do,data.frame)
+S3method(do,default)
+S3method(do,grouped_df)
+S3method(do,rowwise_df)
S3method(do_,"NULL")
S3method(do_,data.frame)
S3method(do_,grouped_df)
S3method(do_,rowwise_df)
-S3method(do_,tbl_sql)
-S3method(escape,"NULL")
-S3method(escape,Date)
-S3method(escape,POSIXt)
-S3method(escape,character)
-S3method(escape,double)
-S3method(escape,factor)
-S3method(escape,ident)
-S3method(escape,integer)
-S3method(escape,list)
-S3method(escape,logical)
-S3method(escape,sql)
-S3method(explain,tbl_sql)
+S3method(filter,data.frame)
+S3method(filter,default)
+S3method(filter,tbl_cube)
+S3method(filter,tbl_df)
+S3method(filter,ts)
S3method(filter_,data.frame)
S3method(filter_,tbl_cube)
S3method(filter_,tbl_df)
-S3method(filter_,tbl_lazy)
-S3method(format,ident)
-S3method(format,sql)
S3method(format,src_local)
-S3method(format,src_sql)
S3method(full_join,data.frame)
S3method(full_join,tbl_df)
-S3method(full_join,tbl_lazy)
+S3method(group_by,data.frame)
+S3method(group_by,default)
+S3method(group_by,rowwise_df)
+S3method(group_by,tbl_cube)
S3method(group_by_,data.frame)
S3method(group_by_,rowwise_df)
S3method(group_by_,tbl_cube)
-S3method(group_by_,tbl_lazy)
+S3method(group_indices,data.frame)
+S3method(group_indices,default)
+S3method(group_indices,grouped_df)
S3method(group_indices_,data.frame)
S3method(group_indices_,grouped_df)
S3method(group_size,data.frame)
S3method(group_size,grouped_df)
S3method(group_size,rowwise_df)
-S3method(group_size,tbl_sql)
+S3method(group_vars,default)
+S3method(group_vars,grouped_df)
+S3method(group_vars,tbl_cube)
S3method(groups,data.frame)
S3method(groups,grouped_df)
S3method(groups,tbl_cube)
-S3method(groups,tbl_lazy)
-S3method(head,tbl_lazy)
S3method(inner_join,data.frame)
S3method(inner_join,tbl_df)
-S3method(inner_join,tbl_lazy)
S3method(intersect,data.frame)
S3method(intersect,default)
-S3method(intersect,tbl_lazy)
S3method(left_join,data.frame)
S3method(left_join,tbl_df)
-S3method(left_join,tbl_lazy)
+S3method(mutate,data.frame)
+S3method(mutate,default)
+S3method(mutate,tbl_df)
S3method(mutate_,data.frame)
S3method(mutate_,tbl_df)
-S3method(mutate_,tbl_lazy)
S3method(n_groups,data.frame)
S3method(n_groups,grouped_df)
S3method(n_groups,rowwise_df)
-S3method(n_groups,tbl_sql)
-S3method(names,sql_variant)
-S3method(op_grps,op_base)
-S3method(op_grps,op_double)
-S3method(op_grps,op_group_by)
-S3method(op_grps,op_single)
-S3method(op_grps,op_summarise)
-S3method(op_grps,op_ungroup)
-S3method(op_grps,tbl_lazy)
-S3method(op_sort,op_arrange)
-S3method(op_sort,op_base)
-S3method(op_sort,op_double)
-S3method(op_sort,op_single)
-S3method(op_sort,op_summarise)
-S3method(op_sort,tbl_lazy)
-S3method(op_vars,op_base)
-S3method(op_vars,op_join)
-S3method(op_vars,op_mutate)
-S3method(op_vars,op_rename)
-S3method(op_vars,op_select)
-S3method(op_vars,op_semi_join)
-S3method(op_vars,op_set_op)
-S3method(op_vars,op_single)
-S3method(op_vars,op_summarise)
-S3method(op_vars,tbl_lazy)
S3method(print,BoolResult)
+S3method(print,all_vars)
+S3method(print,any_vars)
S3method(print,fun_list)
-S3method(print,grouped_df)
-S3method(print,join_query)
S3method(print,location)
-S3method(print,op_base_local)
-S3method(print,op_base_remote)
-S3method(print,op_single)
S3method(print,rowwise_df)
-S3method(print,select_query)
-S3method(print,semi_join_query)
-S3method(print,set_op_query)
-S3method(print,sql)
-S3method(print,sql_variant)
S3method(print,src)
S3method(print,tbl_cube)
-S3method(print,tbl_lazy)
-S3method(print,tbl_sql)
-S3method(query,DBIConnection)
+S3method(pull,data.frame)
S3method(rbind,grouped_df)
S3method(recode,character)
S3method(recode,factor)
S3method(recode,numeric)
+S3method(rename,data.frame)
+S3method(rename,default)
+S3method(rename,grouped_df)
+S3method(rename,tbl_cube)
S3method(rename_,data.frame)
S3method(rename_,grouped_df)
S3method(rename_,tbl_cube)
-S3method(rename_,tbl_lazy)
S3method(right_join,data.frame)
S3method(right_join,tbl_df)
-S3method(right_join,tbl_lazy)
S3method(same_src,data.frame)
-S3method(same_src,src_sql)
S3method(same_src,tbl_cube)
-S3method(same_src,tbl_lazy)
-S3method(same_src,tbl_sql)
S3method(sample_frac,data.frame)
S3method(sample_frac,default)
S3method(sample_frac,grouped_df)
S3method(sample_n,data.frame)
S3method(sample_n,default)
S3method(sample_n,grouped_df)
+S3method(select,data.frame)
+S3method(select,default)
+S3method(select,grouped_df)
+S3method(select,tbl_cube)
S3method(select_,data.frame)
S3method(select_,grouped_df)
S3method(select_,tbl_cube)
-S3method(select_,tbl_lazy)
S3method(semi_join,data.frame)
S3method(semi_join,tbl_df)
-S3method(semi_join,tbl_lazy)
S3method(setdiff,data.frame)
S3method(setdiff,default)
-S3method(setdiff,tbl_lazy)
S3method(setequal,data.frame)
S3method(setequal,default)
+S3method(slice,data.frame)
+S3method(slice,default)
+S3method(slice,tbl_df)
S3method(slice_,data.frame)
S3method(slice_,tbl_df)
-S3method(sql_build,op_arrange)
-S3method(sql_build,op_base_local)
-S3method(sql_build,op_base_remote)
-S3method(sql_build,op_distinct)
-S3method(sql_build,op_filter)
-S3method(sql_build,op_group_by)
-S3method(sql_build,op_head)
-S3method(sql_build,op_join)
-S3method(sql_build,op_mutate)
-S3method(sql_build,op_rename)
-S3method(sql_build,op_select)
-S3method(sql_build,op_semi_join)
-S3method(sql_build,op_set_op)
-S3method(sql_build,op_summarise)
-S3method(sql_build,op_ungroup)
-S3method(sql_build,tbl_lazy)
-S3method(sql_build,tbl_sql)
-S3method(sql_escape_ident,DBITestConnection)
-S3method(sql_escape_ident,MySQLConnection)
-S3method(sql_escape_ident,SQLiteConnection)
-S3method(sql_escape_ident,default)
-S3method(sql_escape_string,default)
-S3method(sql_join,default)
-S3method(sql_render,ident)
-S3method(sql_render,join_query)
-S3method(sql_render,op)
-S3method(sql_render,select_query)
-S3method(sql_render,semi_join_query)
-S3method(sql_render,set_op_query)
-S3method(sql_render,sql)
-S3method(sql_render,tbl_lazy)
-S3method(sql_render,tbl_sql)
-S3method(sql_select,default)
-S3method(sql_semi_join,default)
-S3method(sql_set_op,default)
-S3method(sql_subquery,SQLiteConnection)
-S3method(sql_subquery,default)
-S3method(sql_translate_env,"NULL")
-S3method(sql_translate_env,DBITestConnection)
-S3method(sql_translate_env,MySQLConnection)
-S3method(sql_translate_env,PostgreSQLConnection)
-S3method(sql_translate_env,SQLiteConnection)
-S3method(src_desc,src_mysql)
-S3method(src_desc,src_postgres)
-S3method(src_desc,src_sqlite)
S3method(src_tbls,src_local)
-S3method(src_tbls,src_sql)
+S3method(summarise,data.frame)
+S3method(summarise,default)
+S3method(summarise,tbl_cube)
+S3method(summarise,tbl_df)
S3method(summarise_,data.frame)
S3method(summarise_,tbl_cube)
S3method(summarise_,tbl_df)
-S3method(summarise_,tbl_lazy)
-S3method(tail,tbl_sql)
+S3method(tbl,DBIConnection)
S3method(tbl,src_local)
-S3method(tbl,src_mysql)
-S3method(tbl,src_postgres)
-S3method(tbl,src_sqlite)
+S3method(tbl_sum,grouped_df)
S3method(tbl_vars,data.frame)
S3method(tbl_vars,tbl_cube)
-S3method(tbl_vars,tbl_lazy)
+S3method(transmute,default)
S3method(transmute_,default)
S3method(ungroup,data.frame)
S3method(ungroup,grouped_df)
S3method(ungroup,rowwise_df)
-S3method(ungroup,tbl_lazy)
S3method(union,data.frame)
S3method(union,default)
-S3method(union,tbl_lazy)
S3method(union_all,data.frame)
S3method(union_all,default)
-S3method(union_all,tbl_lazy)
-S3method(unique,sql)
export("%>%")
-export(add_op_single)
+export(add_count)
+export(add_count_)
export(add_row)
export(add_rownames)
+export(add_tally)
+export(add_tally_)
export(all_equal)
+export(all_vars)
export(anti_join)
+export(any_vars)
export(arrange)
export(arrange_)
+export(arrange_all)
+export(arrange_at)
+export(arrange_if)
export(as.tbl)
export(as.tbl_cube)
export(as_data_frame)
+export(as_tibble)
export(auto_copy)
-export(base_agg)
-export(base_no_win)
-export(base_scalar)
-export(base_win)
export(bench_tbls)
export(between)
export(bind_cols)
export(bind_rows)
-export(build_sql)
export(case_when)
export(changes)
+export(check_dbplyr)
export(coalesce)
export(collapse)
export(collect)
export(combine)
export(common_by)
export(compare_tbls)
+export(compare_tbls2)
export(compute)
export(contains)
-export(copy_lahman)
-export(copy_nycflights13)
export(copy_to)
export(count)
export(count_)
@@ -334,6 +225,7 @@ export(db_create_index)
export(db_create_indexes)
export(db_create_table)
export(db_data_type)
+export(db_desc)
export(db_drop_table)
export(db_explain)
export(db_has_table)
@@ -343,6 +235,7 @@ export(db_query_fields)
export(db_query_rows)
export(db_rollback)
export(db_save_query)
+export(db_write_table)
export(dense_rank)
export(desc)
export(dim_desc)
@@ -350,14 +243,19 @@ export(distinct)
export(distinct_)
export(do)
export(do_)
+export(dr_dplyr)
export(ends_with)
-export(escape)
+export(enquo)
export(eval_tbls)
+export(eval_tbls2)
export(everything)
export(explain)
export(failwith)
export(filter)
export(filter_)
+export(filter_all)
+export(filter_at)
+export(filter_if)
export(first)
export(frame_data)
export(full_join)
@@ -366,32 +264,26 @@ export(funs_)
export(glimpse)
export(group_by)
export(group_by_)
+export(group_by_all)
+export(group_by_at)
+export(group_by_if)
export(group_by_prepare)
export(group_indices)
export(group_indices_)
export(group_size)
+export(group_vars)
export(grouped_df)
export(groups)
-export(has_lahman)
-export(has_nycflights13)
export(id)
export(ident)
export(if_else)
export(inner_join)
export(intersect)
export(is.grouped_df)
-export(is.ident)
-export(is.sql)
export(is.src)
export(is.tbl)
-export(join_query)
+export(is_grouped_df)
export(lag)
-export(lahman_df)
-export(lahman_dt)
-export(lahman_mysql)
-export(lahman_postgres)
-export(lahman_sqlite)
-export(lahman_srcs)
export(last)
export(lead)
export(left_join)
@@ -400,7 +292,6 @@ export(lst)
export(lst_)
export(make_tbl)
export(matches)
-export(memdb_frame)
export(min_rank)
export(mutate)
export(mutate_)
@@ -408,38 +299,32 @@ export(mutate_all)
export(mutate_at)
export(mutate_each)
export(mutate_each_)
-export(mutate_each_q)
export(mutate_if)
export(n)
export(n_distinct)
export(n_groups)
export(na_if)
-export(named_commas)
export(near)
export(nth)
export(ntile)
export(num_range)
-export(nycflights13_postgres)
-export(nycflights13_sqlite)
export(one_of)
-export(op_base)
-export(op_double)
-export(op_grps)
-export(op_single)
-export(op_sort)
-export(op_vars)
export(order_by)
-export(partial_eval)
export(percent_rank)
export(progress_estimated)
-export(query)
+export(pull)
+export(quo)
+export(quo_name)
+export(quos)
export(rbind_all)
export(rbind_list)
export(recode)
export(recode_factor)
-export(regroup)
export(rename)
export(rename_)
+export(rename_all)
+export(rename_at)
+export(rename_if)
export(rename_vars)
export(rename_vars_)
export(right_join)
@@ -450,44 +335,32 @@ export(sample_frac)
export(sample_n)
export(select)
export(select_)
+export(select_all)
+export(select_at)
export(select_if)
-export(select_query)
+export(select_var)
export(select_vars)
export(select_vars_)
export(semi_join)
-export(semi_join_query)
-export(set_op_query)
export(setdiff)
export(setequal)
export(show_query)
export(slice)
export(slice_)
export(sql)
-export(sql_build)
export(sql_escape_ident)
export(sql_escape_string)
-export(sql_infix)
export(sql_join)
-export(sql_not_supported)
-export(sql_prefix)
-export(sql_quote)
-export(sql_render)
export(sql_select)
export(sql_semi_join)
export(sql_set_op)
export(sql_subquery)
export(sql_translate_env)
-export(sql_translator)
-export(sql_variant)
-export(sql_vector)
export(src)
-export(src_desc)
export(src_df)
export(src_local)
-export(src_memdb)
export(src_mysql)
export(src_postgres)
-export(src_sql)
export(src_sqlite)
export(src_tbls)
export(starts_with)
@@ -497,7 +370,6 @@ export(summarise_all)
export(summarise_at)
export(summarise_each)
export(summarise_each_)
-export(summarise_each_q)
export(summarise_if)
export(summarize)
export(summarize_)
@@ -507,20 +379,21 @@ export(summarize_each)
export(summarize_each_)
export(summarize_if)
export(tally)
+export(tally_)
export(tbl)
export(tbl_cube)
export(tbl_df)
-export(tbl_sql)
+export(tbl_nongroup_vars)
+export(tbl_sum)
export(tbl_vars)
-export(test_frame)
-export(test_load)
-export(test_register_src)
export(tibble)
export(top_n)
-export(translate_sql)
-export(translate_sql_)
export(transmute)
export(transmute_)
+export(transmute_all)
+export(transmute_at)
+export(transmute_if)
+export(tribble)
export(trunc_mat)
export(type_sum)
export(ungroup)
@@ -528,26 +401,36 @@ export(union)
export(union_all)
export(vars)
export(with_order)
-import(DBI)
-import(assertthat)
+export(wrap_dbplyr_obj)
+import(rlang)
importFrom(R6,R6Class)
importFrom(Rcpp,Rcpp.plugin.maker)
importFrom(Rcpp,cppFunction)
+importFrom(assertthat,"on_failure<-")
+importFrom(assertthat,assert_that)
+importFrom(assertthat,is.flag)
+importFrom(bindrcpp,create_env)
+importFrom(glue,glue)
importFrom(magrittr,"%>%")
+importFrom(methods,is)
+importFrom(pkgconfig,get_config)
importFrom(stats,lag)
importFrom(stats,setNames)
importFrom(stats,update)
importFrom(tibble,add_row)
importFrom(tibble,as_data_frame)
+importFrom(tibble,as_tibble)
importFrom(tibble,data_frame)
importFrom(tibble,data_frame_)
importFrom(tibble,frame_data)
importFrom(tibble,glimpse)
importFrom(tibble,lst)
importFrom(tibble,lst_)
+importFrom(tibble,tbl_sum)
importFrom(tibble,tibble)
+importFrom(tibble,tribble)
importFrom(tibble,trunc_mat)
importFrom(tibble,type_sum)
importFrom(utils,head)
importFrom(utils,tail)
-useDynLib(dplyr)
+useDynLib(dplyr, .registration = TRUE)
diff --git a/NEWS.md b/NEWS.md
index 5256455..9cbc5e0 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -1,3 +1,438 @@
+# dplyr 0.7.4
+
+* Fix recent Fedora and ASAN check errors (#3098).
+
+* Avoid dependency on Rcpp 0.12.10 (#3106).
+
+# dplyr 0.7.3
+
+* Fixed protection error that occurred when creating a character column using grouped `mutate()` (#2971).
+
+* Fixed a rare problem with accessing variable values in `summarise()` when all groups have size one (#3050).
+
+* Fixed rare out-of-bounds memory write in `slice()` when negative indices beyond the number of rows were involved (#3073).
+
+* `select()`, `rename()` and `summarise()` no longer change the grouped vars of the original data (#3038).
+
+* `nth(default = var)`, `first(default = var)` and `last(default = var)` fall back to standard evaluation in a grouped operation instead of triggering an error (#3045).
+
+* `case_when()` now works if all LHS are atomic (#2909), or when LHS or RHS values are zero-length vectors (#3048).
+
+* `case_when()` accepts `NA` on the LHS (#2927).
+
+* Semi- and anti-joins now preserve the order of left-hand-side data frame (#3089).
+
+* Improved error message for invalid list arguments to `bind_rows()` (#3068).
+
+* Grouping by character vectors is now faster (#2204).
+
+* Fixed a crash that occurred when an unexpected input was supplied to
+ the `call` argument of `order_by()` (#3065).
+
+# dplyr 0.7.2
+
+* Move build-time vs. run-time checks out of `.onLoad()` and into `dr_dplyr()`.
+
+# dplyr 0.7.1
+
+* Use new versions of bindrcpp and glue to avoid protection problems.
+ Avoid wrapping arguments to internal error functions (#2877). Fix
+ two protection mistakes found by rchk (#2868).
+
+* Fix C++ error that caused compilation to fail on mac cran (#2862)
+
+* Fix undefined behaviour in `between()`, where `NA_REAL` were
+ assigned instead of `NA_LOGICAL`. (#2855, @zeehio)
+
+* `top_n()` now executes operations lazily for compatibility with
+ database backends (#2848).
+
+* Reuse of new variables created in ungrouped `mutate()` possible
+ again, regression introduced in dplyr 0.7.0 (#2869).
+
+* Quosured symbols do not prevent hybrid handling anymore. This should
+ fix many performance issues introduced with tidyeval (#2822).
+
+# dplyr 0.7.0
+
+## New data, functions, and features
+
+* Five new datasets provide some interesting built-in datasets to demonstrate
+ dplyr verbs (#2094):
+
+ * `starwars` dataset about starwars characters; has list columns
+ * `storms` has the trajectories of ~200 tropical storms
+ * `band_members`, `band_instruments` and `band_instruments2`
+ has some simple data to demonstrate joins.
+
+* New `add_count()` and `add_tally()` for adding an `n` column within groups
+ (#2078, @dgrtwo).
+
+* `arrange()` for grouped data frames gains a `.by_group` argument so you
+ can choose to sort by groups if you want to (defaults to `FALSE`) (#2318)
+
+* New `pull()` generic for extracting a single column either by name or position
+ (either from the left or the right). Thanks to @paulponcet for the idea (#2054).
+
+ This verb is powered with the new `select_var()` internal helper,
+ which is exported as well. It is like `select_vars()` but returns a
+ single variable.
+
+* `as_tibble()` is re-exported from tibble. This is the recommend way to create
+ tibbles from existing data frames. `tbl_df()` has been softly deprecated.
+ `tribble()` is now imported from tibble (#2336, @chrMongeau); this
+ is now preferred to `frame_data()`.
+
+## Deprecated and defunct
+
+* dplyr no longer messages that you need dtplyr to work with data.table (#2489).
+
+* Long deprecated `regroup()`, `mutate_each_q()` and
+ `summarise_each_q()` functions have been removed.
+
+* Deprecated `failwith()`. I'm not even sure why it was here.
+
+* Soft-deprecated `mutate_each()` and `summarise_each()`, these functions
+ print a message which will be changed to a warning in the next release.
+
+* The `.env` argument to `sample_n()` and `sample_frac()` is defunct,
+ passing a value to this argument prints a message which will be changed to a
+ warning in the next release.
+
+## Databases
+
+This version of dplyr includes some major changes to how database connections work. By and large, you should be able to continue using your existing dplyr database code without modification, but there are two big changes that you should be aware of:
+
+* Almost all database related code has been moved out of dplyr and into a
+ new package, [dbplyr](http://github.com/hadley/dbplyr/). This makes dplyr
+ simpler, and will make it easier to release fixes for bugs that only affect
+ databases. `src_mysql()`, `src_postgres()`, and `src_sqlite()` will still
+ live in dplyr so your existing code continues to work.
+
+* It is no longer necessary to create a remote "src". Instead you can work
+ directly with the database connection returned by DBI. This reflects the
+ maturity of the DBI ecosystem. Thanks largely to the work of Kirill Muller
+ (funded by the R Consortium) DBI backends are now much more consistent,
+ comprehensive, and easier to use. That means that there's no longer a
+ need for a layer in between you and DBI.
+
+You can continue to use `src_mysql()`, `src_postgres()`, and `src_sqlite()`, but I recommend a new style that makes the connection to DBI more clear:
+
+```R
+library(dplyr)
+
+con <- DBI::dbConnect(RSQLite::SQLite(), ":memory:")
+DBI::dbWriteTable(con, "mtcars", mtcars)
+
+mtcars2 <- tbl(con, "mtcars")
+mtcars2
+```
+
+This is particularly useful if you want to perform non-SELECT queries as you can do whatever you want with `DBI::dbGetQuery()` and `DBI::dbExecute()`.
+
+If you've implemented a database backend for dplyr, please read the [backend news](https://github.com/hadley/dbplyr/blob/master/NEWS.md#backends) to see what's changed from your perspective (not much). If you want to ensure your package works with both the current and previous version of dplyr, see `wrap_dbplyr_obj()` for helpers.
+
+## UTF-8
+
+* Internally, column names are always represented as character vectors,
+ and not as language symbols, to avoid encoding problems on Windows
+ (#1950, #2387, #2388).
+
+* Error messages and explanations of data frame inequality are now encoded in
+ UTF-8, also on Windows (#2441).
+
+* Joins now always reencode character columns to UTF-8 if necessary. This gives
+ a nice speedup, because now pointer comparison can be used instead of string
+ comparison, but relies on a proper encoding tag for all strings (#2514).
+
+* Fixed problems when joining factor or character encodings with a mix of
+ native and UTF-8 encoded values (#1885, #2118, #2271, #2451).
+
+* Fix `group_by()` for data frames that have UTF-8 encoded names (#2284, #2382).
+
+* New `group_vars()` generic that returns the grouping as character vector, to
+ avoid the potentially lossy conversion to language symbols. The list returned
+ by `group_by_prepare()` now has a new `group_names` component (#1950, #2384).
+
+## Colwise functions
+
+* `rename()`, `select()`, `group_by()`, `filter()`, `arrange()` and
+ `transmute()` now have scoped variants (verbs suffixed with `_if()`,
+ `_at()` and `_all()`). Like `mutate_all()`, `summarise_if()`, etc,
+ these variants apply an operation to a selection of variables.
+
+* The scoped verbs taking predicates (`mutate_if()`, `summarise_if()`,
+ etc) now support S3 objects and lazy tables. S3 objects should
+ implement methods for `length()`, `[[` and `tbl_vars()`. For lazy
+ tables, the first 100 rows are collected and the predicate is
+ applied on this subset of the data. This is robust for the common
+ case of checking the type of a column (#2129).
+
+* Summarise and mutate colwise functions pass `...` on to the manipulation
+ functions.
+
+* The performance of colwise verbs like `mutate_all()` is now back to
+ where it was in `mutate_each()`.
+
+* `funs()` has better handling of namespaced functions (#2089).
+
+* Fix issue with `mutate_if()` and `summarise_if()` when a predicate
+ function returns a vector of `FALSE` (#1989, #2009, #2011).
+
+## Tidyeval
+
+dplyr has a new approach to non-standard evaluation (NSE) called tidyeval.
+It is described in detail in `vignette("programming")` but, in brief, gives you
+the ability to interpolate values in contexts where dplyr usually works with expressions:
+
+```{r}
+my_var <- quo(homeworld)
+
+starwars %>%
+ group_by(!!my_var) %>%
+ summarise_at(vars(height:mass), mean, na.rm = TRUE)
+```
+
+This means that the underscored version of each main verb is no longer needed,
+and so these functions have been deprecated (but remain around for backward compatibility).
+
+* `order_by()`, `top_n()`, `sample_n()` and `sample_frac()` now use
+ tidyeval to capture their arguments by expression. This makes it
+ possible to use unquoting idioms (see `vignette("programming")`) and
+ fixes scoping issues (#2297).
+
+* Most verbs taking dots now ignore the last argument if empty. This
+ makes it easier to copy lines of code without having to worry about
+ deleting trailing commas (#1039).
+
+* [API] The new `.data` and `.env` environments can be used inside
+ all verbs that operate on data: `.data$column_name` accesses the column
+ `column_name`, whereas `.env$var` accesses the external variable `var`.
+ Columns or external variables named `.data` or `.env` are shadowed, use
+ `.data$...` and/or `.env$...` to access them. (`.data` implements strict
+ matching also for the `$` operator (#2591).)
+
+ The `column()` and `global()` functions have been removed. They were never
+ documented officially. Use the new `.data` and `.env` environments instead.
+
+* Expressions in verbs are now interpreted correctly in many cases that
+ failed before (e.g., use of `$`, `case_when()`, nonstandard evaluation, ...).
+ These expressions are now evaluated in a specially constructed temporary
+ environment that retrieves column data on demand with the help of the
+ `bindrcpp` package (#2190). This temporary environment poses restrictions on
+ assignments using `<-` inside verbs. To prevent leaking of broken bindings,
+ the temporary environment is cleared after the evaluation (#2435).
+
+## Verbs
+
+### Joins
+
+* [API] `xxx_join.tbl_df(na_matches = "never")` treats all `NA` values as
+ different from each other (and from any other value), so that they never
+ match. This corresponds to the behavior of joins for database sources,
+ and of database joins in general. To match `NA` values, pass
+ `na_matches = "na"` to the join verbs; this is only supported for data frames.
+ The default is `na_matches = "na"`, kept for the sake of compatibility
+ to v0.5.0. It can be tweaked by calling
+ `pkgconfig::set_config("dplyr::na_matches", "na")` (#2033).
+
+* `common_by()` gets a better error message for unexpected inputs (#2091)
+
+* Fix groups when joining grouped data frames with duplicate columns
+ (#2330, #2334, @davidkretch).
+
+* One of the two join suffixes can now be an empty string, dplyr no longer
+ hangs (#2228, #2445).
+
+* Anti- and semi-joins warn if factor levels are inconsistent (#2741).
+
+* Warnings about join column inconsistencies now contain the column names
+ (#2728).
+
+### Select
+
+* For selecting variables, the first selector decides if it's an inclusive
+ selection (i.e., the initial column list is empty), or an exclusive selection
+ (i.e., the initial column list contains all columns). This means that
+ `select(mtcars, contains("am"), contains("FOO"), contains("vs"))` now returns
+ again both `am` and `vs` columns like in dplyr 0.4.3 (#2275, #2289, @r2evans).
+
+* Select helpers now throw an error if called when no variables have been
+ set (#2452)
+
+* Helper functions in `select()` (and related verbs) are now evaluated
+ in a context where column names do not exist (#2184).
+
+* `select()` (and the internal function `select_vars()`) now support
+ column names in addition to column positions. As a result,
+ expressions like `select(mtcars, "cyl")` are now allowed.
+
+### Other
+
+* `recode()`, `case_when()` and `coalesce()` now support splicing of
+ arguments with rlang's `!!!` operator.
+
+* `count()` now preserves the grouping of its input (#2021).
+
+* `distinct()` no longer duplicates variables (#2001).
+
+* Empty `distinct()` with a grouped data frame works the same way as
+ an empty `distinct()` on an ungrouped data frame, namely it uses all
+ variables (#2476).
+
+* `copy_to()` now returns its output invisibly (since you're often just
+ calling for the side-effect).
+
+* `filter()` and `lag()` throw informative error if used with ts objects (#2219)
+
+* `mutate()` recycles list columns of length 1 (#2171).
+
+* `mutate()` gives better error message when attempting to add a non-vector
+ column (#2319), or attempting to remove a column with `NULL` (#2187, #2439).
+
+* `summarise()` now correctly evaluates newly created factors (#2217), and
+ can create ordered factors (#2200).
+
+* Ungrouped `summarise()` uses summary variables correctly (#2404, #2453).
+
+* Grouped `summarise()` no longer converts character `NA` to empty strings (#1839).
+
+## Combining and comparing
+
+* `all_equal()` now reports multiple problems as a character vector (#1819, #2442).
+
+* `all_equal()` checks that factor levels are equal (#2440, #2442).
+
+* `bind_rows()` and `bind_cols()` give an error for database tables (#2373).
+
+* `bind_rows()` works correctly with `NULL` arguments and an `.id` argument
+ (#2056), and also for zero-column data frames (#2175).
+
+* Breaking change: `bind_rows()` and `combine()` are more strict when coercing.
+ Logical values are no longer coerced to integer and numeric. Date, POSIXct
+ and other integer or double-based classes are no longer coerced to integer or
+ double as there is chance of attributes or information being lost
+ (#2209, @zeehio).
+
+* `bind_cols()` now calls `tibble::repair_names()` to ensure that all
+ names are unique (#2248).
+
+* `bind_cols()` handles empty argument list (#2048).
+
+* `bind_cols()` better handles `NULL` inputs (#2303, #2443).
+
+* `bind_rows()` explicitly rejects columns containing data frames
+ (#2015, #2446).
+
+* `bind_rows()` and `bind_cols()` now accept vectors. They are treated
+ as rows by the former and columns by the latter. Rows require inner
+ names like `c(col1 = 1, col2 = 2)`, while columns require outer
+ names: `col1 = c(1, 2)`. Lists are still treated as data frames but
+ can be spliced explicitly with `!!!`, e.g. `bind_rows(!!! x)` (#1676).
+
+* `rbind_list()` and `rbind_all()` now call `.Deprecated()`, they will be removed
+ in the next CRAN release. Please use `bind_rows()` instead.
+
+* `combine()` accepts `NA` values (#2203, @zeehio)
+
+* `combine()` and `bind_rows()` with character and factor types now always warn
+ about the coercion to character (#2317, @zeehio)
+
+* `combine()` and `bind_rows()` accept `difftime` objects.
+
+* `mutate` coerces results from grouped dataframes accepting combinable data
+ types (such as `integer` and `numeric`). (#1892, @zeehio)
+
+## Vector functions
+
+* `%in%` gets new hybrid handler (#126).
+
+* `between()` returns NA if `left` or `right` is `NA` (fixes #2562).
+
+* `case_when()` supports `NA` values (#2000, @tjmahr).
+
+* `first()`, `last()`, and `nth()` have better default values for factor,
+ Dates, POSIXct, and data frame inputs (#2029).
+
+* Fixed segmentation faults in hybrid evaluation of `first()`, `last()`,
+ `nth()`, `lead()`, and `lag()`. These functions now always fall back to the R
+ implementation if called with arguments that the hybrid evaluator cannot
+ handle (#948, #1980).
+
+* `n_distinct()` gets larger hash tables, giving slightly better performance (#977).
+
+* `nth()` and `ntile()` are more careful about proper data types of their return values (#2306).
+
+* `ntile()` ignores `NA` when computing group membership (#2564).
+
+* `lag()` enforces integer `n` (#2162, @kevinushey).
+
+* hybrid `min()` and `max()` now always return a `numeric` and work correctly
+ in edge cases (empty input, all `NA`, ...) (#2305, #2436).
+
+* `min_rank("string")` no longer segfaults in hybrid evaluation (#2279, #2444).
+
+* `recode()` can now recode a factor to other types (#2268)
+
+* `recode()` gains `.dots` argument to support passing replacements as list
+ (#2110, @jlegewie).
+
+## Other minor changes and bug fixes
+
+* Many error messages are more helpful by referring to a column name or a
+ position in the argument list (#2448).
+
+* New `is_grouped_df()` alias to `is.grouped_df()`.
+
+* `tbl_vars()` now has a `group_vars` argument set to `TRUE` by
+ default. If `FALSE`, group variables are not returned.
+
+* Fixed segmentation fault after calling `rename()` on an invalid grouped
+ data frame (#2031).
+
+* `rename_vars()` gains a `strict` argument to control if an
+ error is thrown when you try and rename a variable that doesn't
+ exist.
+
+* Fixed undefined behavior for `slice()` on a zero-column data frame (#2490).
+
+* Fixed very rare case of false match during join (#2515).
+
+* Restricted workaround for `match()` to R 3.3.0. (#1858).
+
+* dplyr now warns on load when the version of R or Rcpp during installation is
+ different to the currently installed version (#2514).
+
+* Fixed improper reuse of attributes when creating a list column in `summarise()`
+ and perhaps `mutate()` (#2231).
+
+* `mutate()` and `summarise()` always strip the `names` attribute from new
+ or updated columns, even for ungrouped operations (#1689).
+
+* Fixed rare error that could lead to a segmentation fault in
+ `all_equal(ignore_col_order = FALSE)` (#2502).
+
+* The "dim" and "dimnames" attributes are always stripped when copying a
+ vector (#1918, #2049).
+
+* `grouped_df` and `rowwise` are registered officially as S3 classes.
+ This makes them easier to use with S4 (#2276, @joranE, #2789).
+
+* All operations that return tibbles now include the `"tbl"` class.
+ This is important for correct printing with tibble 1.3.1 (#2789).
+
+* Makeflags uses PKG_CPPFLAGS for defining preprocessor macros.
+
+* astyle formatting for C++ code, tested but not changed as part of the tests
+ (#2086, #2103).
+
+* Update RStudio project settings to install tests (#1952).
+
+* Using `Rcpp::interfaces()` to register C callable interfaces, and registering all native exported functions via `R_registerRoutines()` and `useDynLib(.registration = TRUE)` (#2146).
+
+* Formatting of grouped data frames now works by overriding the `tbl_sum()` generic instead of `print()`. This means that the output is more consistent with tibble, and that `format()` is now supported also for SQL sources (#2781).
+
+
# dplyr 0.5.0
## Breaking changes
@@ -81,7 +516,7 @@ All data table related code has been separated out in to a new dtplyr package. T
### Tibble
-Functions to related to the creation and coercion of `tbl_df`s, now live in their own package: [tibble](http://blog.rstudio.org/2016/03/24/tibble-1-0-0/). See `vignette("tibble")` for more details.
+Functions related to the creation and coercion of `tbl_df`s, now live in their own package: [tibble](http://blog.rstudio.org/2016/03/24/tibble-1-0-0/). See `vignette("tibble")` for more details.
* `$` and `[[` methods that never do partial matching (#1504), and throw
an error if the variable does not exist.
@@ -317,9 +752,13 @@ There were two other tweaks to the exported API, but these are less likely to af
that is empty (#1496), or has duplicates (#1192). Suffixes grow progressively
to avoid creating repeated column names (#1460). Joins on string columns
should be substantially faster (#1386). Extra attributes are ok if they are
- identical (#1636). Joins work correct when factor levels not equal
- (#1712, #1559), and anti and semi joins give correct result when by variable is a
- factor (#1571).
+  identical (#1636). Joins work correctly when factor levels are not equal
+ (#1712, #1559). Anti- and semi-joins give correct result when by variable
+ is a factor (#1571), but warn if factor levels are inconsistent (#2741).
+ A clear error message is given for joins where an
+ explicit `by` contains unavailable columns (#1928, #1932).
+ Warnings about join column inconsistencies now contain the column names
+ (#2728).
* `inner_join()`, `left_join()`, `right_join()`, and `full_join()` gain a
`suffix` argument which allows you to control what suffix duplicated variable
@@ -659,16 +1098,18 @@ This is a minor release containing fixes for a number of crashes and issues iden
* `grouped_df()` requires `vars` to be a list of symbols (#665).
-* `min(.,na.rm = TRUE)` works with `Date`s built on numeric vectors (#755)
+* `min(.,na.rm = TRUE)` works with `Date`s built on numeric vectors (#755).
* `rename_()` generic gets missing `.dots` argument (#708).
* `row_number()`, `min_rank()`, `percent_rank()`, `dense_rank()`, `ntile()` and
`cume_dist()` handle data frames with 0 rows (#762). They all preserve
missing values (#774). `row_number()` doesn't segfault when giving an external
- variable with the wrong number of variables (#781)
+ variable with the wrong number of variables (#781).
+
+* `group_indices` handles the edge case when there are no variables (#867).
-* `group_indices` handles the edge case when there are no variables (#867)
+* Removed bogus `NAs introduced by coercion to integer range` on 32-bit Windows (#2708).
# dplyr 0.3.0.1
diff --git a/R/RcppExports.R b/R/RcppExports.R
index 52318c5..94020b3 100644
--- a/R/RcppExports.R
+++ b/R/RcppExports.R
@@ -1,29 +1,46 @@
-# This file was generated by Rcpp::compileAttributes
+# Generated by using Rcpp::compileAttributes() -> do not edit by hand
# Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393
loc <- function(data) {
- .Call('dplyr_loc', PACKAGE = 'dplyr', data)
+ .Call(`_dplyr_loc`, data)
}
dfloc <- function(df) {
- .Call('dplyr_dfloc', PACKAGE = 'dplyr', df)
+ .Call(`_dplyr_dfloc`, df)
}
plfloc <- function(data) {
- .Call('dplyr_plfloc', PACKAGE = 'dplyr', data)
+ .Call(`_dplyr_plfloc`, data)
}
-rank_strings <- function(s) {
- .Call('dplyr_rank_strings', PACKAGE = 'dplyr', s)
+strings_addresses <- function(s) {
+ .Call(`_dplyr_strings_addresses`, s)
+}
+
+gp <- function(x) {
+ .Call(`_dplyr_gp`, x)
+}
+
+#' Enable internal logging
+#'
+#' Log entries, depending on the log level, will be printed to the standard
+#' error stream.
+#'
+#' @param log_level A character value, one of "WARN", "INFO", "DEBUG", "VERB",
+#' or "NONE".
+#'
+#' @keywords internal
+init_logging <- function(log_level) {
+ invisible(.Call(`_dplyr_init_logging`, log_level))
}
-arrange_impl <- function(data, dots) {
- .Call('dplyr_arrange_impl', PACKAGE = 'dplyr', data, dots)
+arrange_impl <- function(data, quosures) {
+ .Call(`_dplyr_arrange_impl`, data, quosures)
}
#' Do values in a numeric vector fall in specified range?
#'
-#' This is a shortcut for \code{x >= left & x <= right}, implemented
+#' This is a shortcut for `x >= left & x <= right`, implemented
#' efficiently in C++ for local values, and translated to the
#' appropriate SQL for remote tables.
#'
@@ -34,199 +51,175 @@ arrange_impl <- function(data, dots) {
#' x <- rnorm(1e2)
#' x[between(x, -1, 1)]
between <- function(x, left, right) {
- .Call('dplyr_between', PACKAGE = 'dplyr', x, left, right)
+ .Call(`_dplyr_between`, x, left, right)
}
-bind_rows_ <- function(dots, id = NULL) {
- .Call('dplyr_bind_rows_', PACKAGE = 'dplyr', dots, id)
+flatten_bindable <- function(x) {
+ .Call(`_dplyr_flatten_bindable`, x)
}
-rbind_list__impl <- function(dots) {
- .Call('dplyr_rbind_list__impl', PACKAGE = 'dplyr', dots)
+bind_rows_ <- function(dots, id) {
+ .Call(`_dplyr_bind_rows_`, dots, id)
}
cbind_all <- function(dots) {
- .Call('dplyr_cbind_all', PACKAGE = 'dplyr', dots)
+ .Call(`_dplyr_cbind_all`, dots)
}
combine_all <- function(data) {
- .Call('dplyr_combine_all', PACKAGE = 'dplyr', data)
+ .Call(`_dplyr_combine_all`, data)
}
combine_vars <- function(vars, xs) {
- .Call('dplyr_combine_vars', PACKAGE = 'dplyr', vars, xs)
+ .Call(`_dplyr_combine_vars`, vars, xs)
}
distinct_impl <- function(df, vars, keep) {
- .Call('dplyr_distinct_impl', PACKAGE = 'dplyr', df, vars, keep)
-}
-
-assert_all_white_list <- function(data) {
- invisible(.Call('dplyr_assert_all_white_list', PACKAGE = 'dplyr', data))
-}
-
-semi_join_impl <- function(x, y, by_x, by_y) {
- .Call('dplyr_semi_join_impl', PACKAGE = 'dplyr', x, y, by_x, by_y)
-}
-
-anti_join_impl <- function(x, y, by_x, by_y) {
- .Call('dplyr_anti_join_impl', PACKAGE = 'dplyr', x, y, by_x, by_y)
+ .Call(`_dplyr_distinct_impl`, df, vars, keep)
}
-inner_join_impl <- function(x, y, by_x, by_y, suffix_x, suffix_y) {
- .Call('dplyr_inner_join_impl', PACKAGE = 'dplyr', x, y, by_x, by_y, suffix_x, suffix_y)
-}
-
-left_join_impl <- function(x, y, by_x, by_y, suffix_x, suffix_y) {
- .Call('dplyr_left_join_impl', PACKAGE = 'dplyr', x, y, by_x, by_y, suffix_x, suffix_y)
-}
-
-right_join_impl <- function(x, y, by_x, by_y, suffix_x, suffix_y) {
- .Call('dplyr_right_join_impl', PACKAGE = 'dplyr', x, y, by_x, by_y, suffix_x, suffix_y)
-}
-
-full_join_impl <- function(x, y, by_x, by_y, suffix_x, suffix_y) {
- .Call('dplyr_full_join_impl', PACKAGE = 'dplyr', x, y, by_x, by_y, suffix_x, suffix_y)
+n_distinct_multi <- function(variables, na_rm = FALSE) {
+ .Call(`_dplyr_n_distinct_multi`, variables, na_rm)
}
-shallow_copy <- function(data) {
- .Call('dplyr_shallow_copy', PACKAGE = 'dplyr', data)
+filter_impl <- function(df, quo) {
+ .Call(`_dplyr_filter_impl`, df, quo)
}
-compatible_data_frame_nonames <- function(x, y, convert) {
- .Call('dplyr_compatible_data_frame_nonames', PACKAGE = 'dplyr', x, y, convert)
+grouped_df_impl <- function(data, symbols, drop) {
+ .Call(`_dplyr_grouped_df_impl`, data, symbols, drop)
}
-compatible_data_frame <- function(x, y, ignore_col_order = TRUE, convert = FALSE) {
- .Call('dplyr_compatible_data_frame', PACKAGE = 'dplyr', x, y, ignore_col_order, convert)
+as_regular_df <- function(df) {
+ .Call(`_dplyr_as_regular_df`, df)
}
-equal_data_frame <- function(x, y, ignore_col_order = TRUE, ignore_row_order = TRUE, convert = FALSE) {
- .Call('dplyr_equal_data_frame', PACKAGE = 'dplyr', x, y, ignore_col_order, ignore_row_order, convert)
+ungroup_grouped_df <- function(df) {
+ .Call(`_dplyr_ungroup_grouped_df`, df)
}
-all_equal_data_frame <- function(args, env) {
- .Call('dplyr_all_equal_data_frame', PACKAGE = 'dplyr', args, env)
+test_grouped_df <- function(data) {
+ .Call(`_dplyr_test_grouped_df`, data)
}
-union_data_frame <- function(x, y) {
- .Call('dplyr_union_data_frame', PACKAGE = 'dplyr', x, y)
+grouped_indices_grouped_df_impl <- function(gdf) {
+ .Call(`_dplyr_grouped_indices_grouped_df_impl`, gdf)
}
-intersect_data_frame <- function(x, y) {
- .Call('dplyr_intersect_data_frame', PACKAGE = 'dplyr', x, y)
+group_size_grouped_cpp <- function(gdf) {
+ .Call(`_dplyr_group_size_grouped_cpp`, gdf)
}
-setdiff_data_frame <- function(x, y) {
- .Call('dplyr_setdiff_data_frame', PACKAGE = 'dplyr', x, y)
+semi_join_impl <- function(x, y, by_x, by_y, na_match) {
+ .Call(`_dplyr_semi_join_impl`, x, y, by_x, by_y, na_match)
}
-match_data_frame <- function(x, y) {
- .Call('dplyr_match_data_frame', PACKAGE = 'dplyr', x, y)
+anti_join_impl <- function(x, y, by_x, by_y, na_match) {
+ .Call(`_dplyr_anti_join_impl`, x, y, by_x, by_y, na_match)
}
-resolve_vars <- function(new_groups, names) {
- .Call('dplyr_resolve_vars', PACKAGE = 'dplyr', new_groups, names)
+inner_join_impl <- function(x, y, by_x, by_y, suffix_x, suffix_y, na_match) {
+ .Call(`_dplyr_inner_join_impl`, x, y, by_x, by_y, suffix_x, suffix_y, na_match)
}
-grouped_df_impl <- function(data, symbols, drop) {
- .Call('dplyr_grouped_df_impl', PACKAGE = 'dplyr', data, symbols, drop)
+left_join_impl <- function(x, y, by_x, by_y, suffix_x, suffix_y, na_match) {
+ .Call(`_dplyr_left_join_impl`, x, y, by_x, by_y, suffix_x, suffix_y, na_match)
}
-grouped_df_adj_impl <- function(data, symbols, drop) {
- .Call('dplyr_grouped_df_adj_impl', PACKAGE = 'dplyr', data, symbols, drop)
+right_join_impl <- function(x, y, by_x, by_y, suffix_x, suffix_y, na_match) {
+ .Call(`_dplyr_right_join_impl`, x, y, by_x, by_y, suffix_x, suffix_y, na_match)
}
-slice_impl <- function(df, dots) {
- .Call('dplyr_slice_impl', PACKAGE = 'dplyr', df, dots)
+full_join_impl <- function(x, y, by_x, by_y, suffix_x, suffix_y, na_match) {
+ .Call(`_dplyr_full_join_impl`, x, y, by_x, by_y, suffix_x, suffix_y, na_match)
}
mutate_impl <- function(df, dots) {
- .Call('dplyr_mutate_impl', PACKAGE = 'dplyr', df, dots)
-}
-
-order_impl <- function(args, env) {
- .Call('dplyr_order_impl', PACKAGE = 'dplyr', args, env)
+ .Call(`_dplyr_mutate_impl`, df, dots)
}
-sort_impl <- function(data) {
- .Call('dplyr_sort_impl', PACKAGE = 'dplyr', data)
+select_impl <- function(df, vars) {
+ .Call(`_dplyr_select_impl`, df, vars)
}
-group_size_grouped_cpp <- function(gdf) {
- .Call('dplyr_group_size_grouped_cpp', PACKAGE = 'dplyr', gdf)
+compatible_data_frame_nonames <- function(x, y, convert) {
+ .Call(`_dplyr_compatible_data_frame_nonames`, x, y, convert)
}
-n_distinct_multi <- function(variables, na_rm = FALSE) {
- .Call('dplyr_n_distinct_multi', PACKAGE = 'dplyr', variables, na_rm)
+compatible_data_frame <- function(x, y, ignore_col_order = TRUE, convert = FALSE) {
+ .Call(`_dplyr_compatible_data_frame`, x, y, ignore_col_order, convert)
}
-as_regular_df <- function(df) {
- .Call('dplyr_as_regular_df', PACKAGE = 'dplyr', df)
+equal_data_frame <- function(x, y, ignore_col_order = TRUE, ignore_row_order = TRUE, convert = FALSE) {
+ .Call(`_dplyr_equal_data_frame`, x, y, ignore_col_order, ignore_row_order, convert)
}
-ungroup_grouped_df <- function(df) {
- .Call('dplyr_ungroup_grouped_df', PACKAGE = 'dplyr', df)
+union_data_frame <- function(x, y) {
+ .Call(`_dplyr_union_data_frame`, x, y)
}
-split_indices <- function(group, groups) {
- .Call('dplyr_split_indices', PACKAGE = 'dplyr', group, groups)
+intersect_data_frame <- function(x, y) {
+ .Call(`_dplyr_intersect_data_frame`, x, y)
}
-gp <- function(x) {
- .Call('dplyr_gp', PACKAGE = 'dplyr', x)
+setdiff_data_frame <- function(x, y) {
+ .Call(`_dplyr_setdiff_data_frame`, x, y)
}
-filter_impl <- function(df, dots) {
- .Call('dplyr_filter_impl', PACKAGE = 'dplyr', df, dots)
+slice_impl <- function(df, dots) {
+ .Call(`_dplyr_slice_impl`, df, dots)
}
-grouped_indices_grouped_df_impl <- function(gdf) {
- .Call('dplyr_grouped_indices_grouped_df_impl', PACKAGE = 'dplyr', gdf)
+summarise_impl <- function(df, dots) {
+ .Call(`_dplyr_summarise_impl`, df, dots)
}
-grouped_indices_impl <- function(data, symbols) {
- .Call('dplyr_grouped_indices_impl', PACKAGE = 'dplyr', data, symbols)
+test_comparisons <- function() {
+ .Call(`_dplyr_test_comparisons`)
}
-select_impl <- function(df, vars) {
- .Call('dplyr_select_impl', PACKAGE = 'dplyr', df, vars)
+test_matches <- function() {
+ .Call(`_dplyr_test_matches`)
}
-strings_addresses <- function(s) {
- .Call('dplyr_strings_addresses', PACKAGE = 'dplyr', s)
+test_length_wrap <- function() {
+ .Call(`_dplyr_test_length_wrap`)
}
-summarise_impl <- function(df, dots) {
- .Call('dplyr_summarise_impl', PACKAGE = 'dplyr', df, dots)
+assert_all_white_list <- function(data) {
+ invisible(.Call(`_dplyr_assert_all_white_list`, data))
}
-test_comparisons <- function() {
- .Call('dplyr_test_comparisons', PACKAGE = 'dplyr')
+shallow_copy <- function(data) {
+ .Call(`_dplyr_shallow_copy`, data)
}
#' Cumulativate versions of any, all, and mean
#'
-#' dplyr adds \code{cumall}, \code{cumany}, and \code{cummean} to complete
+#' dplyr adds `cumall()`, `cumany()`, and `cummean()` to complete
#' R's set of cumulate functions to match the aggregation functions available
#' in most databases
#'
-#' @param x For \code{cumall} & \code{cumany}, a logical vector; for
-#' \code{cummean} an integer or numeric vector
+#' @param x For `cumall()` and `cumany()`, a logical vector; for
+#' `cummean()` an integer or numeric vector
#' @export
cumall <- function(x) {
- .Call('dplyr_cumall', PACKAGE = 'dplyr', x)
+ .Call(`_dplyr_cumall`, x)
}
#' @export
#' @rdname cumall
cumany <- function(x) {
- .Call('dplyr_cumany', PACKAGE = 'dplyr', x)
+ .Call(`_dplyr_cumany`, x)
}
#' @export
#' @rdname cumall
cummean <- function(x) {
- .Call('dplyr_cummean', PACKAGE = 'dplyr', x)
+ .Call(`_dplyr_cummean`, x)
}
+# Register entry points for exported C++ functions
+methods::setLoadAction(function(ns) {
+ .Call('_dplyr_RcppExport_registerCCallable', PACKAGE = 'dplyr')
+})
diff --git a/R/all-equal.r b/R/all-equal.r
index 0ea9373..a530cda 100644
--- a/R/all-equal.r
+++ b/R/all-equal.r
@@ -1,17 +1,17 @@
-#' Flexible equality comparison for data frames.
+#' Flexible equality comparison for data frames
#'
-#' You can use \code{all_equal} with any data frame, and dplyr also provides
-#' \code{tbl_df} methods for \code{\link{all.equal}}.
+#' You can use `all_equal()` with any data frame, and dplyr also provides
+#' `tbl_df` methods for [all.equal()].
#'
#' @param target,current Two data frames to compare.
#' @param ignore_col_order Should order of columns be ignored?
#' @param ignore_row_order Should order of rows be ignored?
#' @param convert Should similar classes be converted? Currently this will
#' convert factor to character and integer to double.
-#' @param ... Ignored. Needed for compatibility with \code{all.equal}.
-#' @return \code{TRUE} if equal, otherwise a character vector describing
-#' the reasons why they're not equal. Use \code{\link{isTRUE}} if using the
-#' result in an \code{if} expression.
+#' @param ... Ignored. Needed for compatibility with `all.equal()`.
+#' @return `TRUE` if equal, otherwise a character vector describing
+#' the reasons why they're not equal. Use [isTRUE()] if using the
+#' result in an `if` expression.
#' @export
#' @examples
#' scramble <- function(x) x[sample(nrow(x)), sample(ncol(x))]
diff --git a/R/bench-compare.r b/R/bench-compare.r
index 8539fb8..7c4d77c 100644
--- a/R/bench-compare.r
+++ b/R/bench-compare.r
@@ -3,29 +3,29 @@
#' These functions support the comparison of results and timings across
#' multiple sources.
#'
-#' @param tbls A list of \code{\link{tbl}}s.
+#' @param tbls,tbls_x,tbls_y A list of [tbl()]s.
#' @param op A function with a single argument, called often with each
-#' element of \code{tbls}.
+#' element of `tbls`.
#' @param ref For checking, a data frame to test results against. If not
-#' supplied, defaults to the results from the first \code{src}.
+#' supplied, defaults to the results from the first `src`.
#' @param compare A function used to compare the results. Defaults to
-#' \code{equal_data_frame} which ignores the order of rows and columns.
+#' `equal_data_frame` which ignores the order of rows and columns.
#' @param times For benchmarking, the number of times each operation is
#' repeated.
#' @param \dots
-#' For \code{compare_tbls}: additional parameters passed on the
-#' \code{compare} function
+#' For `compare_tbls()`: additional parameters passed on the
+#' `compare()` function
#'
-#' For \code{bench_tbls}: additional benchmarks to run.
+#' For `bench_tbls()`: additional benchmarks to run.
#' @return
-#' \code{eval_tbls}: a list of data frames.
+#' `eval_tbls()`: a list of data frames.
#'
-#' \code{compare_tbls}: an invisible \code{TRUE} on success, otherwise
+#' `compare_tbls()`: an invisible `TRUE` on success, otherwise
#' an error is thrown.
#'
-#' \code{bench_tbls}: an object of class
-#' \code{\link[microbenchmark]{microbenchmark}}
-#' @seealso \code{\link{src_local}} for working with local data
+#' `bench_tbls()`: an object of class
+#' [microbenchmark::microbenchmark()]
+#' @seealso [src_local()] for working with local data
#' @examples
#' \dontrun{
#' if (require("microbenchmark") && has_lahman()) {
@@ -58,14 +58,13 @@
#' }
#' }
#' @name bench_compare
+#' @keywords internal
NULL
#' @export
#' @rdname bench_compare
bench_tbls <- function(tbls, op, ..., times = 10) {
- if (!requireNamespace("microbenchmark")) {
- stop("Please install the microbenchmark package", call. = FALSE)
- }
+ check_pkg("microbenchmark", "compute table benchmarks")
# Generate call to microbenchmark function that evaluates op for each tbl
calls <- lapply(seq_along(tbls), function(i) {
@@ -73,22 +72,34 @@ bench_tbls <- function(tbls, op, ..., times = 10) {
})
names(calls) <- names(tbls)
- mb <- as.call(c(quote(microbenchmark::microbenchmark), calls, dots(...),
- list(times = times)))
+ mb <- as.call(c(
+ quote(microbenchmark::microbenchmark), calls, dots(...),
+ list(times = times)
+ ))
eval(mb)
}
#' @export
#' @rdname bench_compare
compare_tbls <- function(tbls, op, ref = NULL, compare = equal_data_frame, ...) {
- if (length(tbls) < 2 && is.null(ref)) {
+ results <- eval_tbls(tbls, op)
+ expect_equal_tbls(results, compare = compare, ...)
+}
+
+#' @export
+#' @rdname bench_compare
+compare_tbls2 <- function(tbls_x, tbls_y, op, ref = NULL, compare = equal_data_frame, ...) {
+ results <- eval_tbls2(tbls_x, tbls_y, op)
+ expect_equal_tbls(results, compare = compare, ...)
+}
+
+expect_equal_tbls <- function(results, ref = NULL, compare = equal_data_frame, ...) {
+ check_pkg("testthat", "compare tables")
+
+ if (length(results) < 2 && is.null(ref)) {
testthat::skip("Need at least two srcs to compare")
}
- if (!requireNamespace("testthat", quietly = TRUE)) {
- stop("Please install the testthat package", call. = FALSE)
- }
- results <- eval_tbls(tbls, op)
if (is.null(ref)) {
ref <- results[[1]]
@@ -99,21 +110,27 @@ compare_tbls <- function(tbls, op, ref = NULL, compare = equal_data_frame, ...)
ref_name <- "supplied comparison"
}
- for(i in seq_along(rest)) {
+ for (i in seq_along(rest)) {
ok <- compare(ref, rest[[i]], ...)
# if (!ok) browser()
- msg <- paste0(names(rest)[[i]], " not equal to ", ref_name, "\n",
- attr(ok, "comment"))
+ msg <- paste0(
+ names(rest)[[i]], " not equal to ", ref_name, "\n",
+ attr(ok, "comment")
+ )
testthat::expect_true(ok, info = msg)
}
invisible(TRUE)
}
-
#' @export
#' @rdname bench_compare
eval_tbls <- function(tbls, op) {
lapply(tbls, function(x) as.data.frame(op(x)))
}
+#' @export
+#' @rdname bench_compare
+eval_tbls2 <- function(tbls_x, tbls_y, op) {
+ Map(function(x, y) as.data.frame(op(x, y)), tbls_x, tbls_y)
+}
diff --git a/R/bind.r b/R/bind.r
index ea7a613..797b450 100644
--- a/R/bind.r
+++ b/R/bind.r
@@ -1,43 +1,80 @@
-#' Efficiently bind multiple data frames by row and column.
+#' Efficiently bind multiple data frames by row and column
#'
#' This is an efficient implementation of the common pattern of
-#' \code{do.call(rbind, dfs)} or \code{do.call(cbind, dfs)} for binding many
-#' data frames into one. \code{combine()} acts like \code{\link{c}()} or
-#' \code{\link{unlist}()} but uses consistent dplyr coercion rules.
+#' `do.call(rbind, dfs)` or `do.call(cbind, dfs)` for binding many
+#' data frames into one. `combine()` acts like [c()] or
+#' [unlist()] but uses consistent dplyr coercion rules.
+#'
+#' The output of `bind_rows()` will contain a column if that column
+#' appears in any of the inputs.
#'
#' @section Deprecated functions:
-#' \code{rbind_list()} and \code{rbind_all()} have been deprecated. Instead use
-#' \code{bind_rows()}.
+#' `rbind_list()` and `rbind_all()` have been deprecated. Instead use
+#' `bind_rows()`.
#'
#' @param ... Data frames to combine.
#'
#' Each argument can either be a data frame, a list that could be a data
#' frame, or a list of data frames.
#'
-#' When column-binding, rows are matched by position, not value so all data
+#' When row-binding, columns are matched by name, and any missing
+#' columns will be filled with NA.
+#'
+#' When column-binding, rows are matched by position, so all data
#' frames must have the same number of rows. To match by value, not
-#' position, see \code{left_join} etc. When row-binding, columns are
-#' matched by name, and any values that don't match will be filled with NA.
-#' @param .id Data frames identifier.
+#' position, see [join].
+#' @param .id Data frame identifier.
#'
-#' When \code{.id} is supplied, a new column of identifiers is
+#' When `.id` is supplied, a new column of identifiers is
#' created to link each row to its original data frame. The labels
-#' are taken from the named arguments to \code{bind_rows()}. When a
+#' are taken from the named arguments to `bind_rows()`. When a
#' list of data frames is supplied, the labels are taken from the
#' names of the list. If no names are found a numeric sequence is
#' used instead.
-#' @return \code{bind_rows} and \code{bind_cols} return the same type as
-#' the first input, either a data frame, \code{tbl_df}, or \code{grouped_df}.
+#' @return `bind_rows()` and `bind_cols()` return the same type as
+#' the first input, either a data frame, `tbl_df`, or `grouped_df`.
#' @aliases rbind_all rbind_list
#' @examples
#' one <- mtcars[1:4, ]
#' two <- mtcars[11:14, ]
#'
-#' # You can either supply data frames as arguments
+#' # You can supply data frames as arguments:
#' bind_rows(one, two)
-#' # Or a single argument containing a list of data frames
+#'
+#' # The contents of lists are automatically spliced:
#' bind_rows(list(one, two))
#' bind_rows(split(mtcars, mtcars$cyl))
+#' bind_rows(list(one, two), list(two, one))
+#'
+#'
+#' # In addition to data frames, you can supply vectors. In the rows
+#' # direction, the vectors represent rows and should have inner
+#' # names:
+#' bind_rows(
+#' c(a = 1, b = 2),
+#' c(a = 3, b = 4)
+#' )
+#'
+#' # You can mix vectors and data frames:
+#' bind_rows(
+#' c(a = 1, b = 2),
+#' data_frame(a = 3:4, b = 5:6),
+#' c(a = 7, b = 8)
+#' )
+#'
+#'
+#' # Note that for historical reasons, lists containing vectors are
+#' # always treated as data frames. Thus their vectors are treated as
+#' # columns rather than rows, and their inner names are ignored:
+#' ll <- list(
+#' a = c(A = 1, B = 2),
+#' b = c(A = 3, B = 4)
+#' )
+#' bind_rows(ll)
+#'
+#' # You can circumvent that behaviour with explicit splicing:
+#' bind_rows(!!! ll)
+#'
#'
#' # When you supply a column name with the `.id` argument, a new
#' # column is created to link each row to its original data frame
@@ -69,24 +106,38 @@ NULL
#' @export
#' @rdname bind
bind_rows <- function(..., .id = NULL) {
- x <- list_or_dots(...)
+ x <- flatten_bindable(dots_values(...))
+
+ if (!length(x)) {
+ # Handle corner cases gracefully, but always return a tibble
+ if (inherits(x, "data.frame")) {
+ return(x)
+ } else {
+ return(tibble())
+ }
+ }
- if (!is.null(.id)) {
- if (!(is.character(.id) && length(.id) == 1)) {
- stop(".id is not a string", call. = FALSE)
+ if (!is_null(.id)) {
+ if (!(is_string(.id))) {
+ bad_args(".id", "must be a scalar string, ",
+ "not {type_of(.id)} of length {length(.id)}"
+ )
+ }
+ if (!all(have_name(x) | map_lgl(x, is_empty))) {
+ x <- compact(x)
+ names(x) <- seq_along(x)
}
- names(x) <- names(x) %||% seq_along(x)
}
bind_rows_(x, .id)
}
-
#' @export
#' @rdname bind
bind_cols <- function(...) {
- x <- list_or_dots(...)
- cbind_all(x)
+ x <- flatten_bindable(dots_values(...))
+ out <- cbind_all(x)
+ tibble::repair_names(out)
}
#' @export
@@ -99,65 +150,3 @@ combine <- function(...) {
combine_all(args)
}
}
-
-list_or_dots <- function(...) {
- dots <- list(...)
-
- # Need to ensure that each component is a data list:
- data_lists <- vapply(dots, is_data_list, logical(1))
- dots[data_lists] <- lapply(dots[data_lists], list)
-
- unlist(dots, recursive = FALSE)
-}
-
-# Is this object a
-is_data_list <- function(x) {
- # data frames are trivially data list, and so are nulls
- if (is.data.frame(x) || is.null(x))
- return(TRUE)
-
- # Must be a list
- if (!is.list(x))
- return(FALSE)
-
- # 0 length named list (#1515)
- if( !is.null(names(x)) && length(x) == 0)
- return(TRUE)
-
- # With names
- if (any(!has_names(x)))
- return(FALSE)
-
- # Where each element is an 1d vector or list
- is_1d <- vapply(x, is_1d, logical(1))
- if (any(!is_1d))
- return(FALSE)
-
- # All of which have the same length
- n <- vapply(x, length, integer(1))
- if (any(n != n[1]))
- return(FALSE)
-
- TRUE
-}
-
-
-# Deprecated functions ----------------------------------------------------
-
-#' @export
-#' @rdname bind
-#' @usage NULL
-rbind_list <- function(...){
- warning("`rbind_list()` is deprecated. Please use `bind_rows()` instead.",
- call. = FALSE)
- rbind_list__impl(environment())
-}
-
-#' @export
-#' @rdname bind
-#' @usage NULL
-rbind_all <- function(x, id = NULL) {
- warning("`rbind_all()` is deprecated. Please use `bind_rows()` instead.",
- call. = FALSE)
- bind_rows_(x, id = id)
-}
diff --git a/R/case_when.R b/R/case_when.R
index 6fd5d16..57e46bb 100644
--- a/R/case_when.R
+++ b/R/case_when.R
@@ -1,7 +1,7 @@
-#' A general vectorised if.
+#' A general vectorised if
#'
-#' This function allows you to vectorise mutiple \code{if} and \code{else if}
-#' statements. It is an R equivalent of the SQL \code{CASE WHEN} statement.
+#' This function allows you to vectorise multiple `if` and `else if`
+#' statements. It is an R equivalent of the SQL `CASE WHEN` statement.
#'
#' @param ... A sequence of two-sided formulas. The left hand side (LHS)
#' determines which values match this case. The right hand side (RHS)
@@ -10,9 +10,11 @@
#' The LHS must evaluate to a logical vector. Each logical vector can
#' either have length 1 or a common length. All RHSs must evaluate to
#' the same type of vector.
+#'
+#' These dots are evaluated with [explicit splicing][rlang::dots_list].
#' @export
-#' @return A vector as long as the longest LHS, with the type (and
-#' attributes) of the first RHS. Inconsistent lengths of types will
+#' @return A vector as long as the longest LHS or RHS, with the type (and
+#' attributes) of the first RHS. Inconsistent lengths or types will
#' generate an error.
#' @examples
#' x <- 1:50
@@ -31,12 +33,34 @@
#' x %% 7 == 0 ~ "buzz",
#' x %% 35 == 0 ~ "fizz buzz"
#' )
+#'
+#' # case_when is particularly useful inside mutate when you want to
+#' # create a new variable that relies on a complex combination of existing
+#' # variables
+#' starwars %>%
+#' select(name:mass, gender, species) %>%
+#' mutate(
+#' type = case_when(
+#' height > 200 | mass > 200 ~ "large",
+#' species == "Droid" ~ "robot",
+#' TRUE ~ "other"
+#' )
+#' )
+#'
+#' # Dots support splicing:
+#' patterns <- list(
+#' x %% 35 == 0 ~ "fizz buzz",
+#' x %% 5 == 0 ~ "fizz",
+#' x %% 7 == 0 ~ "buzz",
+#' TRUE ~ as.character(x)
+#' )
+#' case_when(!!! patterns)
case_when <- function(...) {
- formulas <- list(...)
+ formulas <- dots_list(...)
n <- length(formulas)
if (n == 0) {
- stop("No cases provided", call. = FALSE)
+ abort("No cases provided")
}
query <- vector("list", n)
@@ -46,35 +70,47 @@ case_when <- function(...) {
f <- formulas[[i]]
if (!inherits(f, "formula") || length(f) != 3) {
non_formula_arg <- substitute(list(...))[[i + 1]]
- stop("Case ", i , " (", deparse_trunc(non_formula_arg),
- ") is not a two-sided formula", call. = FALSE)
+ header <- glue("Case {i} ({deparsed})", deparsed = fmt_obj1(deparse_trunc(non_formula_arg)))
+ glubort(header, "must be a two-sided formula, not a {type_of(f)}")
}
env <- environment(f)
- query[[i]] <- eval(f[[2]], envir = env)
+ query[[i]] <- eval_bare(f[[2]], env)
if (!is.logical(query[[i]])) {
- stop("LHS of case ", i, " (", deparse_trunc(f_lhs(f)),
- ") is ", typeof(query[[i]]), ", not logical",
- call. = FALSE)
+ header <- glue("LHS of case {i} ({deparsed})", deparsed = fmt_obj1(deparse_trunc(f_lhs(f))))
+ glubort(header, "must be a logical, not {type_of(query[[i]])}")
}
- value[[i]] <- eval(f[[3]], envir = env)
+ value[[i]] <- eval_bare(f[[3]], env)
+ }
+
+ lhs_lengths <- map_int(query, length)
+ rhs_lengths <- map_int(value, length)
+ all_lengths <- unique(c(lhs_lengths, rhs_lengths))
+ if (length(all_lengths) <= 1) {
+ m <- all_lengths[[1]]
+ } else {
+ non_atomic_lengths <- all_lengths[all_lengths != 1]
+ m <- non_atomic_lengths[[1]]
+ if (length(non_atomic_lengths) > 1) {
+ inconsistent_lengths <- non_atomic_lengths[-1]
+ lhs_problems <- lhs_lengths %in% inconsistent_lengths
+ rhs_problems <- rhs_lengths %in% inconsistent_lengths
+ problems <- lhs_problems | rhs_problems
+ bad_calls(
+ formulas[problems],
+ check_length_val(inconsistent_lengths, m, header = NULL, .abort = identity)
+ )
+ }
}
- m <- max(vapply(query, length, integer(1)))
out <- value[[1]][rep(NA_integer_, m)]
replaced <- rep(FALSE, m)
for (i in seq_len(n)) {
- check_length(
- query[[i]], out,
- paste0("LHS of case ", i, " (", deparse_trunc(f_lhs(formulas[[i]])), ")"))
-
- out <- replace_with(
- out, query[[i]] & !replaced, value[[i]],
- paste0("RHS of case ", i, " (", deparse_trunc(f_rhs(formulas[[i]])), ")"))
- replaced <- replaced | query[[i]]
+ out <- replace_with(out, query[[i]] & !replaced, value[[i]], NULL)
+ replaced <- replaced | (query[[i]] & !is.na(query[[i]]))
}
out
diff --git a/R/coalesce.R b/R/coalesce.R
index 6f5f87d..3ccd641 100644
--- a/R/coalesce.R
+++ b/R/coalesce.R
@@ -1,14 +1,16 @@
#' Find first non-missing element
#'
-#' Given a set of vectors, \code{coelesce} finds the first non-missing value
-#' at each position. This is inspired by the SQL \code{COALESCE} function
-#' which does the same thing for \code{NULL}s.
+#' Given a set of vectors, `coalesce()` finds the first non-missing value
+#' at each position. This is inspired by the SQL `COALESCE` function
+#' which does the same thing for `NULL`s.
#'
-#' @param x,... Vectors. All inputs should either be length 1, or the
-#' same length as \code{x}
-#' @return A vector the same length as \code{x} with missing values replaced
-#' by the first non-missing value.
-#' @seealso \code{\link{na_if}()} to replace specified values with a \code{NA}.
+#' @param ... Vectors. All inputs should either be length 1, or the
+#' same length as the first argument.
+#'
+#' These dots are evaluated with [explicit splicing][rlang::dots_list].
+#' @return A vector the same length as the first `...` argument with
+#' missing values replaced by the first non-missing value.
+#' @seealso [na_if()] to replace specified values with a `NA`.
#' @export
#' @examples
#' # Use a single value to replace all missing values
@@ -19,10 +21,28 @@
#' y <- c(1, 2, NA, NA, 5)
#' z <- c(NA, NA, 3, 4, 5)
#' coalesce(y, z)
-coalesce <- function(x, ...) {
- values <- list(...)
+#'
+#' # Supply lists by splicing them into dots:
+#' vecs <- list(
+#' c(1, 2, NA, NA, 5),
+#' c(NA, NA, 3, 4, 5)
+#' )
+#' coalesce(!!! vecs)
+coalesce <- function(...) {
+ if (missing(..1)) {
+ abort("At least one argument must be supplied")
+ }
+
+ values <- dots_list(...)
+ x <- values[[1]]
+ values <- values[-1]
+
for (i in seq_along(values)) {
- x <- replace_with(x, is.na(x), values[[i]], paste0("Vector ", i))
+ x <- replace_with(
+ x, is.na(x), values[[i]],
+ glue("Argument {i + 1}"),
+ glue("length of {fmt_args(~x)}")
+ )
}
x
}
diff --git a/R/colwise-arrange.R b/R/colwise-arrange.R
new file mode 100644
index 0000000..44a40bc
--- /dev/null
+++ b/R/colwise-arrange.R
@@ -0,0 +1,43 @@
+#' Arrange rows by a selection of variables
+#'
+#' These [scoped] variants of [arrange()] sort a data frame by a
+#' selection of variables. Like [arrange()], you can modify the
+#' variables before ordering with [funs()].
+#'
+#' @inheritParams scoped
+#' @export
+#' @examples
+#' df <- as_tibble(mtcars)
+#' df
+#' arrange_all(df)
+#'
+#' # You can supply a function that will be applied before taking the
+#' # ordering of the variables. The variables of the sorted tibble
+#' # keep their original values.
+#' arrange_all(df, desc)
+#' arrange_all(df, funs(desc(.)))
+arrange_all <- function(.tbl, .funs = list(), ...) {
+ funs <- manip_all(.tbl, .funs, enquo(.funs), caller_env(), ...)
+ if (!length(funs)) {
+ funs <- syms(tbl_vars(.tbl))
+ }
+ arrange(.tbl, !!! funs)
+}
+#' @rdname arrange_all
+#' @export
+arrange_at <- function(.tbl, .vars, .funs = list(), ...) {
+ funs <- manip_at(.tbl, .vars, .funs, enquo(.funs), caller_env(), ...)
+ if (!length(funs)) {
+ funs <- tbl_at_syms(.tbl, .vars)
+ }
+ arrange(.tbl, !!! funs)
+}
+#' @rdname arrange_all
+#' @export
+arrange_if <- function(.tbl, .predicate, .funs = list(), ...) {
+ funs <- manip_if(.tbl, .predicate, .funs, enquo(.funs), caller_env(), ...)
+ if (!length(funs)) {
+ funs <- tbl_if_syms(.tbl, .predicate)
+ }
+ arrange(.tbl, !!! funs)
+}
diff --git a/R/colwise-filter.R b/R/colwise-filter.R
new file mode 100644
index 0000000..1c2c618
--- /dev/null
+++ b/R/colwise-filter.R
@@ -0,0 +1,76 @@
+#' Filter within a selection of variables
+#'
+#' These [scoped] filtering verbs apply a predicate expression to a
+#' selection of variables. The predicate expression should be quoted
+#' with [all_vars()] or [any_vars()] and should mention the pronoun
+#' `.` to refer to variables.
+#'
+#' @inheritParams scoped
+#' @param .vars_predicate A quoted predicate expression as returned by
+#' [all_vars()] or [any_vars()].
+#' @export
+#' @examples
+#' # While filter() accepts expressions with specific variables, the
+#' # scoped filter verbs take an expression with the pronoun `.` and
+#' # replicate it over all variables. This expression should be quoted
+#' # with all_vars() or any_vars():
+#' all_vars(is.na(.))
+#' any_vars(is.na(.))
+#'
+#'
+#' # You can take the intersection of the replicated expressions:
+#' filter_all(mtcars, all_vars(. > 150))
+#'
+#' # Or the union:
+#' filter_all(mtcars, any_vars(. > 150))
+#'
+#'
+#' # You can vary the selection of columns on which to apply the
+#' # predicate. filter_at() takes a vars() specification:
+#' filter_at(mtcars, vars(starts_with("d")), any_vars((. %% 2) == 0))
+#'
+#' # And filter_if() selects variables with a predicate function:
+#' filter_if(mtcars, ~ all(floor(.) == .), all_vars(. != 0))
+filter_all <- function(.tbl, .vars_predicate) {
+ syms <- syms(tbl_nongroup_vars(.tbl))
+ pred <- apply_filter_syms(.vars_predicate, syms, .tbl)
+ filter(.tbl, !! pred)
+}
+#' @rdname filter_all
+#' @export
+filter_if <- function(.tbl, .predicate, .vars_predicate) {
+ syms <- tbl_if_syms(.tbl, .predicate)
+ pred <- apply_filter_syms(.vars_predicate, syms, .tbl)
+ filter(.tbl, !! pred)
+}
+#' @rdname filter_all
+#' @export
+filter_at <- function(.tbl, .vars, .vars_predicate) {
+ syms <- tbl_at_syms(.tbl, .vars)
+ pred <- apply_filter_syms(.vars_predicate, syms, .tbl)
+ filter(.tbl, !! pred)
+}
+
+apply_filter_syms <- function(pred, syms, tbl) {
+ if (is_empty(syms)) {
+ bad_args(".predicate", "has no matching columns")
+ }
+
+ if (inherits(pred, "all_vars")) {
+ joiner <- all_exprs
+ } else if (inherits(pred, "any_vars")) {
+ joiner <- any_exprs
+ } else {
+ bad_args(".vars_predicate", "must be a call to `all_vars()` or `any_vars()`, ",
+ "not {type_of(pred)}"
+ )
+ }
+
+ pred <- map(syms, function(sym) expr_substitute(pred, quote(.), sym))
+
+ if (length(pred)) {
+ joiner(!!! pred)
+ } else {
+ pred
+ }
+}
diff --git a/R/colwise-group-by.R b/R/colwise-group-by.R
new file mode 100644
index 0000000..7b2f1eb
--- /dev/null
+++ b/R/colwise-group-by.R
@@ -0,0 +1,48 @@
+#' Group by a selection of variables
+#'
+#' These [scoped] variants of [group_by()] group a data frame by a
+#' selection of variables. Like [group_by()], they have optional
+#' [mutate] semantics.
+#'
+#' @inheritParams scoped
+#' @param .add Passed to the `add` argument of [group_by()].
+#' @export
+#' @examples
+#' # Group a data frame by all variables:
+#' group_by_all(mtcars)
+#'
+#' # Group by variables selected with a predicate:
+#' group_by_if(iris, is.factor)
+#'
+#' # Group by variables selected by name:
+#' group_by_at(mtcars, vars(vs, am))
+#'
+#' # Like group_by(), the scoped variants have optional mutate
+#' # semantics. This provides a shortcut for group_by() + mutate():
+#' group_by_all(mtcars, as.factor)
+#' group_by_if(iris, is.factor, as.character)
+group_by_all <- function(.tbl, .funs = list(), ...) {
+ funs <- manip_all(.tbl, .funs, enquo(.funs), caller_env(), ...)
+ if (!length(funs)) {
+ funs <- syms(tbl_vars(.tbl))
+ }
+ group_by(.tbl, !!! funs)
+}
+#' @rdname group_by_all
+#' @export
+group_by_at <- function(.tbl, .vars, .funs = list(), ..., .add = FALSE) {
+ funs <- manip_at(.tbl, .vars, .funs, enquo(.funs), caller_env(), ...)
+ if (!length(funs)) {
+ funs <- tbl_at_syms(.tbl, .vars)
+ }
+ group_by(.tbl, !!! funs, add = .add)
+}
+#' @rdname group_by_all
+#' @export
+group_by_if <- function(.tbl, .predicate, .funs = list(), ..., .add = FALSE) {
+ funs <- manip_if(.tbl, .predicate, .funs, enquo(.funs), caller_env(), ...)
+ if (!length(funs)) {
+ funs <- tbl_if_syms(.tbl, .predicate)
+ }
+ group_by(.tbl, !!! funs, add = .add)
+}
diff --git a/R/colwise-mutate.R b/R/colwise-mutate.R
new file mode 100644
index 0000000..ab6d989
--- /dev/null
+++ b/R/colwise-mutate.R
@@ -0,0 +1,268 @@
+#' Summarise and mutate multiple columns.
+#'
+#' @description
+#' These verbs are [scoped] variants of [summarise()], [mutate()] and
+#' [transmute()]. They apply operations on a selection of variables.
+#'
+#' * `summarise_all()`, `mutate_all()` and `transmute_all()` apply the
+#' functions to all (non-grouping) columns.
+#'
+#' * `summarise_at()`, `mutate_at()` and `transmute_at()` allow you to
+#' select columns using the same name-based [select_helpers] just
+#' like with [select()].
+#'
+#' * `summarise_if()`, `mutate_if()` and `transmute_if()` operate on
+#' columns for which a predicate returns `TRUE`.
+#' @inheritParams scoped
+#' @param .cols This argument has been renamed to `.vars` to fit
+#' dplyr's terminology and is deprecated.
+#' @return A data frame. By default, the newly created columns have the shortest
+#' names needed to uniquely identify the output. To force inclusion of a name,
+#' even when not needed, name the input (see examples for details).
+#' @seealso [vars()], [funs()]
+#' @export
+#' @examples
+#' # The scoped variants of summarise() and mutate() make it easy to
+#' # apply the same transformation to multiple variables:
+#'
+#' iris %>%
+#' group_by(Species) %>%
+#' summarise_all(mean)
+#'
+#' # There are three variants.
+#' # * _all affects every variable
+#' # * _at affects variables selected with a character vector or vars()
+#' # * _if affects variables selected with a predicate function:
+#'
+#' starwars %>% summarise_at(vars(height:mass), mean, na.rm = TRUE)
+#' starwars %>% summarise_at(c("height", "mass"), mean, na.rm = TRUE)
+#' starwars %>% summarise_if(is.numeric, mean, na.rm = TRUE)
+#'
+#' # mutate_if is particularly useful for transforming variables from
+#' # one type to another
+#' iris %>% as_tibble() %>% mutate_if(is.factor, as.character)
+#' iris %>% as_tibble() %>% mutate_if(is.double, as.integer)
+#'
+#' # ---------------------------------------------------------------------------
+#' # If you want apply multiple transformations, use funs()
+#' by_species <- iris %>% group_by(Species)
+#'
+#' by_species %>% summarise_all(funs(min, max))
+#' # Note that output variable name now includes the function name, in order to
+#' # keep things distinct.
+#'
+#' # You can express more complex inline transformations using .
+#' by_species %>% mutate_all(funs(. / 2.54))
+#'
+#' # Function names will be included if .funs has names or multiple inputs
+#' by_species %>% mutate_all(funs(cm = . / 2.54))
+#' by_species %>% summarise_all(funs(med = median))
+#' by_species %>% summarise_all(funs(Q3 = quantile), probs = 0.75)
+#' by_species %>% summarise_all(c("min", "max"))
+summarise_all <- function(.tbl, .funs, ...) {
+ funs <- manip_all(.tbl, .funs, enquo(.funs), caller_env(), ...)
+ summarise(.tbl, !!! funs)
+}
+#' @rdname summarise_all
+#' @export
+summarise_if <- function(.tbl, .predicate, .funs, ...) {
+ funs <- manip_if(.tbl, .predicate, .funs, enquo(.funs), caller_env(), ...)
+ summarise(.tbl, !!! funs)
+}
+#' @rdname summarise_all
+#' @export
+summarise_at <- function(.tbl, .vars, .funs, ..., .cols = NULL) {
+ .vars <- check_dot_cols(.vars, .cols)
+ funs <- manip_at(.tbl, .vars, .funs, enquo(.funs), caller_env(), ...)
+ summarise(.tbl, !!! funs)
+}
+
+#' @rdname summarise_all
+#' @export
+summarize_all <- summarise_all
+#' @rdname summarise_all
+#' @export
+summarize_if <- summarise_if
+#' @rdname summarise_all
+#' @export
+summarize_at <- summarise_at
+
+#' @rdname summarise_all
+#' @export
+mutate_all <- function(.tbl, .funs, ...) {
+ funs <- manip_all(.tbl, .funs, enquo(.funs), caller_env(), ...)
+ mutate(.tbl, !!! funs)
+}
+#' @rdname summarise_all
+#' @export
+mutate_if <- function(.tbl, .predicate, .funs, ...) {
+ funs <- manip_if(.tbl, .predicate, .funs, enquo(.funs), caller_env(), ...)
+ mutate(.tbl, !!! funs)
+}
+#' @rdname summarise_all
+#' @export
+mutate_at <- function(.tbl, .vars, .funs, ..., .cols = NULL) {
+ .vars <- check_dot_cols(.vars, .cols)
+ funs <- manip_at(.tbl, .vars, .funs, enquo(.funs), caller_env(), ...)
+ mutate(.tbl, !!! funs)
+}
+
+#' @rdname summarise_all
+#' @export
+transmute_all <- function(.tbl, .funs, ...) {
+ funs <- manip_all(.tbl, .funs, enquo(.funs), caller_env(), ...)
+ transmute(.tbl, !!! funs)
+}
+#' @rdname summarise_all
+#' @export
+transmute_if <- function(.tbl, .predicate, .funs, ...) {
+ funs <- manip_if(.tbl, .predicate, .funs, enquo(.funs), caller_env(), ...)
+ transmute(.tbl, !!! funs)
+}
+#' @rdname summarise_all
+#' @export
+transmute_at <- function(.tbl, .vars, .funs, ..., .cols = NULL) {
+ .vars <- check_dot_cols(.vars, .cols)
+ funs <- manip_at(.tbl, .vars, .funs, enquo(.funs), caller_env(), ...)
+ transmute(.tbl, !!! funs)
+}
+
+# Helpers -----------------------------------------------------------------
+
+manip_all <- function(.tbl, .funs, .quo, .env, ...) {
+ syms <- syms(tbl_nongroup_vars(.tbl))
+ funs <- as_fun_list(.funs, .quo, .env, ...)
+ manip_apply_syms(funs, syms, .tbl)
+}
+manip_if <- function(.tbl, .predicate, .funs, .quo, .env, ...) {
+ vars <- tbl_if_syms(.tbl, .predicate, .env)
+ funs <- as_fun_list(.funs, .quo, .env, ...)
+ manip_apply_syms(funs, vars, .tbl)
+}
+manip_at <- function(.tbl, .vars, .funs, .quo, .env, ...) {
+ syms <- tbl_at_syms(.tbl, .vars)
+ funs <- as_fun_list(.funs, .quo, .env, ...)
+ manip_apply_syms(funs, syms, .tbl)
+}
+
+check_dot_cols <- function(vars, cols) {
+ if (is_null(cols)) {
+ vars
+ } else {
+ inform("`.cols` has been renamed and is deprecated, please use `.vars`")
+ if (missing(vars)) cols else vars
+ }
+}
+
+manip_apply_syms <- function(funs, syms, tbl) {
+ stopifnot(is_fun_list(funs))
+
+ out <- vector("list", length(syms) * length(funs))
+ dim(out) <- c(length(syms), length(funs))
+ for (i in seq_along(syms)) {
+ for (j in seq_along(funs)) {
+ var_sym <- sym(syms[[i]])
+ out[[i, j]] <- expr_substitute(funs[[j]], quote(.), var_sym)
+ }
+ }
+ dim(out) <- NULL
+
+ # Use symbols as default names
+ unnamed <- !have_name(syms)
+ names(syms)[unnamed] <- map_chr(syms[unnamed], as_string)
+
+ if (length(funs) == 1 && !attr(funs, "have_name")) {
+ names(out) <- names(syms)
+ } else if (length(syms) == 1 && all(unnamed)) {
+ names(out) <- names(funs)
+ } else {
+ syms_names <- map_chr(syms, as_string)
+ grid <- expand.grid(var = syms_names, call = names(funs))
+ names(out) <- paste(grid$var, grid$call, sep = "_")
+ }
+
+ out
+}
+
+# Deprecated --------------------------------------------------------------
+
+#' Summarise and mutate multiple columns.
+#'
+#' @description
+#'
+#' `mutate_each()` and `summarise_each()` are deprecated in favour of
+#' a more featureful family of functions: [mutate_all()],
+#' [mutate_at()], [mutate_if()], [summarise_all()], [summarise_at()]
+#' and [summarise_if()].
+#'
+#' The `_each()` functions have two replacements depending on what
+#' variables you want to apply `funs` to. To apply a function to all
+#' variables, use [mutate_all()] or [summarise_all()]. To apply a
+#' function to a selection of variables, use [mutate_at()] or
+#' [summarise_at()].
+#'
+#' See the relevant section of `vignette("compatibility")` for more
+#' information.
+#'
+#' @keywords internal
+#' @export
+summarise_each <- function(tbl, funs, ...) {
+ summarise_each_(tbl, funs, quos(...))
+}
+#' @export
+#' @rdname summarise_each
+summarise_each_ <- function(tbl, funs, vars) {
+ msg <- glue(
+ "`summarise_each()` is deprecated.
+ Use `summarise_all()`, `summarise_at()` or `summarise_if()` instead."
+ )
+ if (is_empty(vars)) {
+ inform(glue(msg, "\nTo map `funs` over all variables, use `summarise_all()`"))
+ vars <- tbl_nongroup_vars(tbl)
+ } else {
+ inform(glue(msg, "\nTo map `funs` over a selection of variables, use `summarise_at()`"))
+ vars <- compat_lazy_dots(vars, caller_env())
+ vars <- select_vars(tbl_nongroup_vars(tbl), !!! vars)
+ }
+ if (is_character(funs)) {
+ funs <- funs_(funs)
+ }
+
+ funs <- manip_apply_syms(funs, syms(vars), tbl)
+ summarise(tbl, !!! funs)
+}
+
+#' @export
+#' @rdname summarise_each
+mutate_each <- function(tbl, funs, ...) {
+ if (is_character(funs)) {
+ funs <- funs_(funs)
+ }
+
+ mutate_each_(tbl, funs, quos(...))
+}
+#' @export
+#' @rdname summarise_each
+mutate_each_ <- function(tbl, funs, vars) {
+ msg <- glue(
+ "`mutate_each()` is deprecated.
+ Use `mutate_all()`, `mutate_at()` or `mutate_if()` instead."
+ )
+ if (is_empty(vars)) {
+ inform(glue(msg, "\nTo map `funs` over all variables, use `mutate_all()`"))
+ vars <- tbl_nongroup_vars(tbl)
+ } else {
+ inform(glue(msg, "\nTo map `funs` over a selection of variables, use `mutate_at()`"))
+ vars <- compat_lazy_dots(vars, caller_env())
+ vars <- select_vars(tbl_nongroup_vars(tbl), !!! vars)
+ }
+ funs <- manip_apply_syms(funs, syms(vars), tbl)
+ mutate(tbl, !!! funs)
+}
+
+#' @rdname summarise_each
+#' @export
+summarize_each <- summarise_each
+#' @rdname summarise_each
+#' @export
+summarize_each_ <- summarise_each_
diff --git a/R/colwise-select.R b/R/colwise-select.R
new file mode 100644
index 0000000..6b4502e
--- /dev/null
+++ b/R/colwise-select.R
@@ -0,0 +1,97 @@
+#' Select and rename a selection of variables
+#'
+#' @description
+#'
+#' These [scoped] variants of [select()] and [rename()] operate on a
+#' selection of variables. The semantics of these verbs have simple
+#' but important differences:
+#'
+#' * Selection drops variables that are not in the selection while
+#' renaming retains them.
+#'
+#' * The renaming function is optional for selection but not for
+#' renaming.
+#'
+#' @inheritParams scoped
+#' @param .funs A single expression quoted with [funs()] or within a
+#' quosure, a string naming a function, or a function.
+#' @export
+#' @examples
+#' # Supply a renaming function:
+#' select_all(mtcars, toupper)
+#' select_all(mtcars, "toupper")
+#' select_all(mtcars, funs(toupper(.)))
+#'
+#' # Selection drops unselected variables:
+#' is_whole <- function(x) all(floor(x) == x)
+#' select_if(mtcars, is_whole, toupper)
+#'
+#' # But renaming retains them:
+#' rename_if(mtcars, is_whole, toupper)
+#'
+#' # The renaming function is optional for selection:
+#' select_if(mtcars, is_whole)
+select_all <- function(.tbl, .funs = list(), ...) {
+ funs <- as_fun_list(.funs, enquo(.funs), caller_env(), ...)
+ vars <- tbl_nongroup_vars(.tbl)
+ syms <- vars_select_syms(vars, funs, .tbl)
+ select(.tbl, !!! syms)
+}
+#' @rdname select_all
+#' @export
+rename_all <- function(.tbl, .funs = list(), ...) {
+ funs <- as_fun_list(.funs, enquo(.funs), caller_env(), ...)
+ vars <- tbl_nongroup_vars(.tbl)
+ syms <- vars_select_syms(vars, funs, .tbl, strict = TRUE)
+ rename(.tbl, !!! syms)
+}
+
+#' @rdname select_all
+#' @export
+select_if <- function(.tbl, .predicate, .funs = list(), ...) {
+ funs <- as_fun_list(.funs, enquo(.funs), caller_env(), ...)
+ vars <- tbl_if_vars(.tbl, .predicate, caller_env())
+ syms <- vars_select_syms(vars, funs, .tbl)
+ select(.tbl, !!! syms)
+}
+#' @rdname select_all
+#' @export
+rename_if <- function(.tbl, .predicate, .funs = list(), ...) {
+ funs <- as_fun_list(.funs, enquo(.funs), caller_env(), ...)
+ vars <- tbl_if_vars(.tbl, .predicate, caller_env())
+ syms <- vars_select_syms(vars, funs, .tbl, strict = TRUE)
+ rename(.tbl, !!! syms)
+}
+
+#' @rdname select_all
+#' @export
+select_at <- function(.tbl, .vars, .funs = list(), ...) {
+ vars <- tbl_at_vars(.tbl, .vars)
+ funs <- as_fun_list(.funs, enquo(.funs), caller_env(), ...)
+ syms <- vars_select_syms(vars, funs, .tbl)
+ select(.tbl, !!! syms)
+}
+#' @rdname select_all
+#' @export
+rename_at <- function(.tbl, .vars, .funs = list(), ...) {
+ vars <- tbl_at_vars(.tbl, .vars)
+ funs <- as_fun_list(.funs, enquo(.funs), caller_env(), ...)
+ syms <- vars_select_syms(vars, funs, .tbl, strict = TRUE)
+ rename(.tbl, !!! syms)
+}
+
+vars_select_syms <- function(vars, funs, tbl, strict = FALSE) {
+ if (length(funs) > 1) {
+ bad_args(".funs", "must contain one renaming function, not {length(funs)}")
+ } else if (length(funs) == 1) {
+ fun <- as_function(funs[[1]])
+ syms <- set_names(syms(vars), fun(vars))
+ } else if (!strict) {
+ syms <- syms(vars)
+ } else {
+ bad_args(".funs", "must specify a renaming function")
+ }
+
+ group_syms <- base::setdiff(syms(group_vars(tbl)), syms)
+ c(group_syms, syms)
+}
diff --git a/R/colwise.R b/R/colwise.R
index 6049448..fce5bac 100644
--- a/R/colwise.R
+++ b/R/colwise.R
@@ -1,294 +1,178 @@
-#' Summarise and mutate multiple columns.
+#' Operate on a selection of variables
#'
-#' \code{summarise_all()} and \code{mutate_all()} apply the functions
-#' to all (non-grouping) columns. \code{summarise_at()} and
-#' \code{mutate_at()} allow you to select columns
-#' using the same name-based \code{\link{select_helpers}} as with
-#' \code{\link{select}()}. \code{summarise_if}() and
-#' \code{mutate_if}() operate on columns for which a predicate returns
-#' \code{TRUE}. Finally, \code{\link{summarise_each}()} and
-#' \code{\link{mutate_each}()} are older variants that will be
-#' deprecated in the future.
+#' The variants suffixed with `_if`, `_at` or `_all` apply an
+#' expression (sometimes several) to all variables within a specified
+#' subset. This subset can contain all variables (`_all` variants), a
+#' [vars()] selection (`_at` variants), or variables selected with a
+#' predicate (`_if` variants).
#'
-#' @param .tbl a tbl
-#' @param .funs List of function calls generated by
-#' \code{\link{funs}()}, or a character vector of function names, or
-#' simply a function (only for local sources).
-#' @param .cols A list of columns generated by \code{\link{vars}()},
-#' or a character vector of column names, or a numeric vector of column
-#' positions.
-#' @param .predicate A predicate function to be applied to the columns
-#' or a logical vector. The columns for which \code{.predicate} is
-#' or returns \code{TRUE} will be summarised or mutated.
-#' @param ... Additional arguments for the function calls. These are
-#' evaluated only once.
-#' @return A data frame. By default, the newly created columns have the shortest
-#' names needed to distinguish the output. To force inclusion of a name,
-#' even when not needed, name the input (see examples for details).
-#' @seealso \code{\link{vars}()}, \code{\link{funs}()}
-#' @examples
-#' by_species <- iris %>% group_by(Species)
+#' The verbs with scoped variants are:
#'
-#' # One function
-#' by_species %>% summarise_all(n_distinct)
-#' by_species %>% summarise_all(mean)
+#' * [mutate()], [transmute()] and [summarise()]. See [summarise_all()].
+#' * [filter()]. See [filter_all()].
+#' * [group_by()]. See [group_by_all()].
+#' * [rename()] and [select()]. See [select_all()].
+#' * [arrange()]. See [arrange_all()]
#'
-#' # Use the _at and _if variants for conditional mapping.
-#' by_species %>% summarise_if(is.numeric, mean)
+#' There are three kinds of scoped variants. They differ in the scope
+#' of the variable selection on which operations are applied:
#'
-#' # summarise_at() can use select() helpers with the vars() function:
-#' by_species %>% summarise_at(vars(Petal.Width), mean)
-#' by_species %>% summarise_at(vars(matches("Width")), mean)
+#' * Verbs suffixed with `_all()` apply an operation on all variables.
#'
-#' # You can also specify columns with column names or column positions:
-#' by_species %>% summarise_at(c("Sepal.Width", "Petal.Width"), mean)
-#' by_species %>% summarise_at(c(1, 3), mean)
+#' * Verbs suffixed with `_at()` apply an operation on a subset of
+#' variables specified with the quoting function [vars()]. This
+#' quoting function accepts [select_vars()] helpers like
+#' [starts_with()]. Instead of a [vars()] selection, you can also
+#' supply an [integerish][rlang::is_integerish] vector of column
+#' positions or a character vector of column names.
#'
-#' # You can provide additional arguments. Those are evaluated only once:
-#' by_species %>% summarise_all(mean, trim = 1)
-#' by_species %>% summarise_at(vars(Petal.Width), mean, trim = 1)
+#' * Verbs suffixed with `_if()` apply an operation on the subset of
+#' variables for which a predicate function returns `TRUE`. Instead
+#' of a predicate function, you can also supply a logical vector.
#'
-#' # You can provide an expression or multiple functions with the funs() helper.
-#' by_species %>% mutate_all(funs(. * 0.4))
-#' by_species %>% summarise_all(funs(min, max))
-#' # Note that output variable name must now include function name, in order to
-#' # keep things distinct.
+#' @param .tbl A `tbl` object.
+#' @param .funs List of function calls generated by [funs()], or a
+#' character vector of function names, or simply a function.
#'
-#' # Function names will be included if .funs has names or whenever multiple
-#' # functions are used.
-#' by_species %>% mutate_all(funs("in" = . / 2.54))
-#' by_species %>% mutate_all(funs(rg = diff(range(.))))
-#' by_species %>% summarise_all(funs(med = median))
-#' by_species %>% summarise_all(funs(Q3 = quantile), probs = 0.75)
-#' by_species %>% summarise_all(c("min", "max"))
+#' Bare formulas are passed to [rlang::as_function()] to create
+#' purrr-style lambda functions. Note that these lambdas prevent
+#' hybrid evaluation from happening and it is thus more efficient to
+#' supply functions like `mean()` directly rather than in a
+#' lambda-formula.
+#' @param .vars A list of columns generated by [vars()],
+#' or a character vector of column names, or a numeric vector of column
+#' positions.
+#' @param .predicate A predicate function to be applied to the columns
+#' or a logical vector. The variables for which `.predicate` is or
+#' returns `TRUE` are selected. This argument is passed to
+#' [rlang::as_function()] and thus supports quosure-style lambda
+#' functions and strings representing function names.
+#' @param ... Additional arguments for the function calls in
+#' `.funs`. These are evaluated only once, with [explicit
+#' splicing][rlang::dots_list].
+#' @name scoped
+NULL
+
+
+#' Select variables
#'
-#' # Two functions, continued
-#' by_species %>% summarise_at(vars(Petal.Width, Sepal.Width), funs(min, max))
-#' by_species %>% summarise_at(vars(matches("Width")), funs(min, max))
+#' This helper is intended to provide equivalent semantics to
+#' [select()]. It is used for instance in scoped summarising and
+#' mutating verbs ([mutate_at()] and [summarise_at()]).
#'
-#' @aliases summarise_each_q mutate_each_q
+#' Note that verbs accepting a `vars()` specification also accept an
+#' [integerish][rlang::is_integerish] vector of positions or a
+#' character vector of column names.
+#'
+#' @param ... Variables to include/exclude in mutate/summarise. You
+#' can use same specifications as in [select()]. If missing,
+#' defaults to all non-grouping variables.
+#'
+#' These arguments are automatically [quoted][rlang::quo] and later
+#' [evaluated][rlang::eval_tidy] in the context of the data
+#' frame. They support [unquoting][rlang::quasiquotation]. See
+#' `vignette("programming")` for an introduction to these concepts.
+#' @seealso [funs()], [all_vars()] and [any_vars()] for other quoting
+#' functions that you can use with scoped verbs.
#' @export
-summarise_all <- function(.tbl, .funs, ...) {
- funs <- as.fun_list(.funs, .env = parent.frame(), ...)
- vars <- colwise_(.tbl, funs, list())
- summarise_(.tbl, .dots = vars)
+vars <- function(...) {
+ quos(...)
}
-#' @rdname summarise_all
+#' Apply predicate to all variables
+#'
+#' These quoting functions signal to scoped filtering verbs
+#' (e.g. [filter_if()] or [filter_all()]) that a predicate expression
+#' should be applied to all relevant variables. The `all_vars()`
+#' variant takes the intersection of the predicate expressions with
+#' `&` while the `any_vars()` variant takes the union with `|`.
+#'
+#' @param expr A predicate expression. This variable supports
+#' [unquoting][rlang::quasiquotation] and will be evaluated in the
+#' context of the data frame. It should return a logical vector.
+#'
+#' This argument is automatically [quoted][rlang::quo] and later
+#' [evaluated][rlang::eval_tidy] in the context of the data
+#' frame. It supports [unquoting][rlang::quasiquotation]. See
+#' `vignette("programming")` for an introduction to these concepts.
+#' @seealso [funs()] and [vars()] for other quoting functions that you
+#' can use with scoped verbs.
#' @export
-mutate_all <- function(.tbl, .funs, ...) {
- funs <- as.fun_list(.funs, .env = parent.frame(), ...)
- vars <- colwise_(.tbl, funs, list())
- mutate_(.tbl, .dots = vars)
+all_vars <- function(expr) {
+ set_attrs(enquo(expr), class = c("all_vars", "quosure", "formula"))
}
-
-#' @rdname summarise_all
+#' @rdname all_vars
#' @export
-summarise_if <- function(.tbl, .predicate, .funs, ...) {
- if (inherits(.tbl, "tbl_lazy")) {
- stop("Conditional colwise operations currently require local sources",
- call. = FALSE)
- }
- cols <- probe_colwise_names(.tbl, .predicate)
- funs <- as.fun_list(.funs, .env = parent.frame(), ...)
- vars <- colwise_(.tbl, funs, cols)
-
- summarise_(.tbl, .dots = vars)
+any_vars <- function(expr) {
+ set_attrs(enquo(expr), class = c("any_vars", "quosure", "formula"))
}
-
-#' @rdname summarise_all
#' @export
-mutate_if <- function(.tbl, .predicate, .funs, ...) {
- if (inherits(.tbl, "tbl_lazy")) {
- stop("Conditional colwise operations currently require local sources",
- call. = FALSE)
- }
- cols <- probe_colwise_names(.tbl, .predicate)
- funs <- as.fun_list(.funs, .env = parent.frame(), ...)
- vars <- colwise_(.tbl, funs, cols)
-
- mutate_(.tbl, .dots = vars)
-}
-
-probe_colwise_names <- function(tbl, p, ...) {
- if (is.logical(p)) {
- stopifnot(length(p) == length(tbl))
- selected <- p
- } else {
- selected <- vapply(tbl, p, logical(1), ...)
- }
-
- vars <- tbl_vars(tbl)
- vars[selected]
-}
-
-#' @rdname summarise_all
-#' @export
-summarise_at <- function(.tbl, .cols, .funs, ...) {
- cols <- select_colwise_names(.tbl, .cols)
- funs <- as.fun_list(.funs, .env = parent.frame(), ...)
- vars <- colwise_(.tbl, funs, cols)
-
- summarise_(.tbl, .dots = vars)
+print.all_vars <- function(x, ...) {
+ cat("<predicate intersection>\n")
+ NextMethod()
}
-
-#' @rdname summarise_all
#' @export
-mutate_at <- function(.tbl, .cols, .funs, ...) {
- cols <- select_colwise_names(.tbl, .cols)
- funs <- as.fun_list(.funs, .env = parent.frame(), ...)
- vars <- colwise_(.tbl, funs, cols)
-
- mutate_(.tbl, .dots = vars)
+print.any_vars <- function(x, ...) {
+ cat("<predicate union>\n")
+ NextMethod()
}
-#' @rdname summarise_all
-#' @export
-summarize_all <- summarise_all
-
-#' @rdname summarise_all
-#' @export
-summarize_at <- summarise_at
-
-#' @rdname summarise_all
-#' @export
-summarize_if <- summarise_if
-
-#' Select columns
-#'
-#' This helper has equivalent semantics to \code{\link{select}()}. Its
-#' purpose is to provide \code{select()} semantics to the colwise
-#' summarising and mutating verbs.
-#' @param ... Variables to include/exclude in mutate/summarise. You
-#' can use same specifications as in \code{\link{select}}. If
-#' missing, defaults to all non-grouping variables.
-#' @seealso \code{\link{summarise_all}()}
-#' @export
-vars <- function(...) {
- structure(lazyeval::lazy_dots(...),
- class = c("col_list", "lazy_dots")
- )
-}
-is_col_list <- function(cols) inherits(cols, "col_list")
-select_colwise_names <- function(tbl, cols) {
- vars <- tbl_vars(tbl)
+# Requires tbl_vars() method
+tbl_at_vars <- function(tbl, vars) {
+ tibble_vars <- tbl_nongroup_vars(tbl)
- if (is.character(cols) || is_col_list(cols)) {
- selected <- cols
- } else if (is.numeric(cols)) {
- selected <- vars[cols]
+ if (is_character(vars)) {
+ vars
+ } else if (is_integerish(vars)) {
+ tibble_vars[vars]
+ } else if (is_quosures(vars)) {
+ out <- select_vars(tibble_vars, !!! vars)
+ if (!any(have_name(vars))) {
+ names(out) <- NULL
+ }
+ out
} else {
- stop(".cols should be a character/numeric vector or a columns object",
- call. = FALSE)
+ bad_args(".vars", "must be a character/numeric vector or a `vars()` object, ",
+ "not {type_of(vars)}"
+ )
}
-
- selected
+}
+tbl_at_syms <- function(tbl, vars) {
+ vars <- tbl_at_vars(tbl, vars)
+ set_names(syms(vars), names(vars))
}
-colwise_ <- function(tbl, calls, vars) {
- stopifnot(is.fun_list(calls))
-
- named_calls <- attr(calls, "has_names")
- named_vars <- any(has_names(vars))
+# Requires tbl_vars(), `[[`() and length() methods
+tbl_if_vars <- function(.tbl, .p, .env, ...) {
+ vars <- tbl_nongroup_vars(.tbl)
- if (length(vars) == 0) {
- vars <- lazyeval::lazy_dots(everything())
+ if (is_logical(.p)) {
+ stopifnot(length(.p) == length(vars))
+ return(syms(vars[.p]))
}
- vars <- select_vars_(tbl_vars(tbl), vars, exclude = as.character(groups(tbl)))
-
- out <- vector("list", length(vars) * length(calls))
- dim(out) <- c(length(vars), length(calls))
- vars <- enc2native(vars)
- for (i in seq_along(vars)) {
- for (j in seq_along(calls)) {
- out[[i, j]] <- lazyeval::interp(calls[[j]],
- .values = list(. = as.name(vars[i])))
- }
- }
- dim(out) <- NULL
-
- if (length(calls) == 1 && !named_calls) {
- names(out) <- names(vars)
- } else if (length(vars) == 1 && !named_vars) {
- names(out) <- names(calls)
- } else {
- grid <- expand.grid(var = names(vars), call = names(calls))
- names(out) <- paste(grid$var, grid$call, sep = "_")
+ if (inherits(.tbl, "tbl_lazy")) {
+ inform("Applying predicate on the first 100 rows")
+ .tbl <- collect(.tbl, n = 100)
}
- out
-}
-
-#' Summarise and mutate multiple columns.
-#'
-#' Apply one or more functions to one or more columns. Grouping variables
-#' are always excluded from modification.
-#'
-#' In the future \code{mutate_each()} and \code{summarise_each()} will
-#' be deprecated in favour of a more featureful family of functions:
-#' \code{\link{mutate_all}()}, \code{\link{mutate_at}()},
-#' \code{\link{mutate_if}()}, \code{\link{summarise_all}()},
-#' \code{\link{summarise_at}()} and \code{\link{summarise_if}()}.
-#' @param tbl a tbl
-#' @param funs List of function calls, generated by \code{\link{funs}}, or
-#' a character vector of function names.
-#' @param vars,... Variables to include/exclude in mutate/summarise.
-#' You can use same specifications as in \code{\link{select}}. If missing,
-#' defaults to all non-grouping variables.
-#'
-#' For standard evaluation versions (ending in \code{_}) these can
-#' be either a list of expressions or a character vector.
-#' @export
-summarise_each <- function(tbl, funs, ...) {
- summarise_each_(tbl, funs, lazyeval::lazy_dots(...))
-}
-
-#' @export
-#' @rdname summarise_each
-summarise_each_ <- function(tbl, funs, vars) {
- if (is.character(funs)) {
- funs <- funs_(funs)
+ if (is_fun_list(.p)) {
+ if (length(.p) != 1) {
+ bad_args(".predicate", "must have length 1, not {length(.p)}")
+ }
+ .p <- .p[[1]]
}
+ .p <- as_function(.p, .env)
- vars <- colwise_(tbl, funs, vars)
- summarise_(tbl, .dots = vars)
-}
-
-#' @rdname summarise_each
-#' @export
-summarize_each <- summarise_each
-
-#' @rdname summarise_each
-#' @export
-summarize_each_ <- summarise_each_
-
-#' @export
-#' @rdname summarise_each
-mutate_each <- function(tbl, funs, ...) {
- if (is.character(funs)) {
- funs <- funs_(funs)
+ n <- length(vars)
+ selected <- lgl_len(n)
+ for (i in seq_len(n)) {
+ selected[[i]] <- .p(.tbl[[vars[[i]]]], ...)
}
- mutate_each_(tbl, funs, lazyeval::lazy_dots(...))
-}
-
-#' @export
-#' @rdname summarise_each
-mutate_each_ <- function(tbl, funs, vars) {
- vars <- colwise_(tbl, funs, vars)
- mutate_(tbl, .dots = vars)
-}
-
-
-#' @export
-summarise_each_q <- function(...) {
- .Deprecated("summarise_all")
- summarise_each_(...)
+ vars[selected]
}
-#' @export
-mutate_each_q <- function(...) {
- .Deprecated("mutate_all")
- mutate_each_(...)
+tbl_if_syms <- function(.tbl, .p, .env, ...) {
+ syms(tbl_if_vars(.tbl, .p, .env, ...))
}
diff --git a/R/compat-dbplyr.R b/R/compat-dbplyr.R
new file mode 100644
index 0000000..98aba62
--- /dev/null
+++ b/R/compat-dbplyr.R
@@ -0,0 +1,72 @@
+#' dbplyr compatibility functions
+#'
+#' @description
+#' In dplyr 0.6.0, a number of database and SQL functions moved from dplyr to
+#' dbplyr. The generic functions stayed in dplyr (since there is no easy way
+#' to conditionally import a generic from different packages), but many other
+#' SQL and database helper functions moved. If you have written a backend,
+#' these functions generate the code you need to work with both dplyr 0.5.0
+#' and dplyr 0.6.0.
+#'
+#' @keywords internal
+#' @export
+#' @examples
+#' if (requireNamespace("dbplyr", quietly = TRUE)) {
+#' wrap_dbplyr_obj("build_sql")
+#' wrap_dbplyr_obj("base_agg")
+#' }
+check_dbplyr <- function() {
+ check_pkg("dbplyr", "communicate with database backends", install = FALSE)
+}
+
+#' @export
+#' @rdname check_dbplyr
+wrap_dbplyr_obj <- function(obj_name) {
+ # Silence R CMD check NOTE
+ `UQ<-` <- NULL
+
+ obj <- getExportedValue("dbplyr", obj_name)
+ obj_sym <- sym(obj_name)
+
+ dbplyr_sym <- lang("::", quote(dbplyr), obj_sym)
+ dplyr_sym <- lang("::", quote(dplyr), obj_sym)
+
+ if (is.function(obj)) {
+ args <- formals()
+ pass_on <- map(set_names(names(args)), sym)
+
+ dbplyr_call <- expr(UQ(dbplyr_sym)(!!!pass_on))
+ dplyr_call <- expr(UQ(dplyr_sym)(!!!pass_on))
+ } else {
+ args <- list()
+
+ dbplyr_call <- dbplyr_sym
+ dplyr_call <- dplyr_sym
+ }
+
+ body <- expr({
+ if (utils::packageVersion("dplyr") > "0.5.0") {
+ dplyr::check_dbplyr()
+ UQ(dbplyr_call)
+ } else {
+ UQ(dplyr_call)
+ }
+ })
+ wrapper <- new_function(args, body, caller_env())
+
+ expr(UQ(obj_sym) <- UQE(wrapper))
+}
+
+#' @inherit dbplyr::sql
+#' @export
+sql <- function(...) {
+ check_dbplyr()
+ dbplyr::sql(...)
+}
+
+#' @inherit dbplyr::ident
+#' @export
+ident <- function(...) {
+ check_dbplyr()
+ dbplyr::ident(...)
+}
diff --git a/R/compat-lazyeval.R b/R/compat-lazyeval.R
new file mode 100644
index 0000000..7fb3b38
--- /dev/null
+++ b/R/compat-lazyeval.R
@@ -0,0 +1,90 @@
+# nocov - compat-lazyeval (last updated: rlang 0.0.0.9018)
+
+# This file serves as a reference for compatibility functions for lazyeval.
+# Please find the most recent version in rlang's repository.
+
+
+warn_underscored <- function() {
+ return(NULL)
+ warn(paste(
+ "The underscored versions are deprecated in favour of",
+ "tidy evaluation idioms. Please see the documentation",
+ "for `quo()` in rlang"
+ ))
+}
+warn_text_se <- function() {
+ return(NULL)
+ warn("Text parsing is deprecated, please supply an expression or formula")
+}
+
+compat_lazy <- function(lazy, env = caller_env(), warn = TRUE) {
+ if (warn) warn_underscored()
+
+ if (missing(lazy)) {
+ return(quo())
+ }
+
+ coerce_type(lazy, "a quosure",
+ formula = as_quosure(lazy, env),
+ symbol = ,
+ language = new_quosure(lazy, env),
+ string = ,
+ character = {
+ if (warn) warn_text_se()
+ parse_quosure(lazy[[1]], env)
+ },
+ logical = ,
+ integer = ,
+ double = {
+ if (length(lazy) > 1) {
+ warn("Truncating vector to length 1")
+ lazy <- lazy[[1]]
+ }
+ new_quosure(lazy, env)
+ },
+ list =
+ coerce_class(lazy, "a quosure",
+ lazy = new_quosure(lazy$expr, lazy$env)
+ )
+ )
+}
+
+compat_lazy_dots <- function(dots, env, ..., .named = FALSE) {
+ if (missing(dots)) {
+ dots <- list()
+ }
+ if (inherits(dots, c("lazy", "formula"))) {
+ dots <- list(dots)
+ } else {
+ dots <- unclass(dots)
+ }
+ dots <- c(dots, list(...))
+
+ warn <- TRUE
+ for (i in seq_along(dots)) {
+ dots[[i]] <- compat_lazy(dots[[i]], env, warn)
+ warn <- FALSE
+ }
+
+ named <- have_name(dots)
+ if (.named && any(!named)) {
+ nms <- map_chr(dots[!named], f_text)
+ names(dots)[!named] <- nms
+ }
+
+ names(dots) <- names2(dots)
+ dots
+}
+
+compat_as_lazy <- function(quo) {
+ structure(class = "lazy", list(
+ expr = f_rhs(quo),
+ env = f_env(quo)
+ ))
+}
+compat_as_lazy_dots <- function(...) {
+ structure(class = "lazy_dots", map(quos(...), compat_as_lazy))
+}
+
+
+# nocov end
diff --git a/R/compat-purrr.R b/R/compat-purrr.R
new file mode 100644
index 0000000..ac276a4
--- /dev/null
+++ b/R/compat-purrr.R
@@ -0,0 +1,161 @@
+# nocov - compat-purrr (last updated: rlang 0.0.0.9007)
+
+# This file serves as a reference for compatibility functions for
+# purrr. They are not drop-in replacements but allow a similar style
+# of programming. This is useful in cases where purrr is too heavy a
+# package to depend on. Please find the most recent version in rlang's
+# repository.
+
+map <- function(.x, .f, ...) {
+ lapply(.x, .f, ...)
+}
+map_mold <- function(.x, .f, .mold, ...) {
+ out <- vapply(.x, .f, .mold, ..., USE.NAMES = FALSE)
+ rlang::set_names(out, names(.x))
+}
+map_lgl <- function(.x, .f, ...) {
+ map_mold(.x, .f, logical(1), ...)
+}
+map_int <- function(.x, .f, ...) {
+ map_mold(.x, .f, integer(1), ...)
+}
+map_dbl <- function(.x, .f, ...) {
+ map_mold(.x, .f, double(1), ...)
+}
+map_chr <- function(.x, .f, ...) {
+ map_mold(.x, .f, character(1), ...)
+}
+map_cpl <- function(.x, .f, ...) {
+ map_mold(.x, .f, complex(1), ...)
+}
+
+pluck <- function(.x, .f) {
+ map(.x, `[[`, .f)
+}
+pluck_lgl <- function(.x, .f) {
+ map_lgl(.x, `[[`, .f)
+}
+pluck_int <- function(.x, .f) {
+ map_int(.x, `[[`, .f)
+}
+pluck_dbl <- function(.x, .f) {
+ map_dbl(.x, `[[`, .f)
+}
+pluck_chr <- function(.x, .f) {
+ map_chr(.x, `[[`, .f)
+}
+pluck_cpl <- function(.x, .f) {
+ map_cpl(.x, `[[`, .f)
+}
+
+map2 <- function(.x, .y, .f, ...) {
+ Map(.f, .x, .y, ...)
+}
+map2_lgl <- function(.x, .y, .f, ...) {
+ as.vector(map2(.x, .y, .f, ...), "logical")
+}
+map2_int <- function(.x, .y, .f, ...) {
+ as.vector(map2(.x, .y, .f, ...), "integer")
+}
+map2_dbl <- function(.x, .y, .f, ...) {
+ as.vector(map2(.x, .y, .f, ...), "double")
+}
+map2_chr <- function(.x, .y, .f, ...) {
+ as.vector(map2(.x, .y, .f, ...), "character")
+}
+map2_cpl <- function(.x, .y, .f, ...) {
+ as.vector(map2(.x, .y, .f, ...), "complex")
+}
+
+args_recycle <- function(args) {
+ lengths <- map_int(args, length)
+ n <- max(lengths)
+
+ stopifnot(all(lengths == 1L | lengths == n))
+ to_recycle <- lengths == 1L
+ args[to_recycle] <- map(args[to_recycle], function(x) rep.int(x, n))
+
+ args
+}
+pmap <- function(.l, .f, ...) {
+ args <- args_recycle(.l)
+ do.call("mapply", c(
+ FUN = list(quote(.f)),
+ args, MoreArgs = quote(list(...)),
+ SIMPLIFY = FALSE, USE.NAMES = FALSE
+ ))
+}
+
+probe <- function(.x, .p, ...) {
+ if (is_logical(.p)) {
+ stopifnot(length(.p) == length(.x))
+ .p
+ } else {
+ map_lgl(.x, .p, ...)
+ }
+}
+
+keep <- function(.x, .f, ...) {
+ .x[probe(.x, .f, ...)]
+}
+discard <- function(.x, .p, ...) {
+ sel <- probe(.x, .p, ...)
+ .x[is.na(sel) | !sel]
+}
+map_if <- function(.x, .p, .f, ...) {
+ matches <- probe(.x, .p)
+ .x[matches] <- map(.x[matches], .f, ...)
+ .x
+}
+
+compact <- function(.x) {
+ Filter(length, .x)
+}
+
+transpose <- function(.l) {
+ inner_names <- names(.l[[1]])
+ if (is.null(inner_names)) {
+ fields <- seq_along(.l[[1]])
+ } else {
+ fields <- set_names(inner_names)
+ }
+
+ map(fields, function(i) {
+ map(.l, .subset2, i)
+ })
+}
+
+every <- function(.x, .p, ...) {
+ for (i in seq_along(.x)) {
+ if (!rlang::is_true(.p(.x[[i]], ...))) return(FALSE)
+ }
+ TRUE
+}
+some <- function(.x, .p, ...) {
+ for (i in seq_along(.x)) {
+ if (rlang::is_true(.p(.x[[i]], ...))) return(TRUE)
+ }
+ FALSE
+}
+negate <- function(.p) {
+ function(...) !.p(...)
+}
+
+reduce <- function(.x, .f, ..., .init) {
+ f <- function(x, y) .f(x, y, ...)
+ Reduce(f, .x, init = .init)
+}
+reduce_right <- function(.x, .f, ..., .init) {
+ f <- function(x, y) .f(y, x, ...)
+ Reduce(f, .x, init = .init, right = TRUE)
+}
+accumulate <- function(.x, .f, ..., .init) {
+ f <- function(x, y) .f(x, y, ...)
+ Reduce(f, .x, init = .init, accumulate = TRUE)
+}
+accumulate_right <- function(.x, .f, ..., .init) {
+ f <- function(x, y) .f(y, x, ...)
+ Reduce(f, .x, init = .init, right = TRUE, accumulate = TRUE)
+}
+
+# nocov end
diff --git a/R/compute-collect.r b/R/compute-collect.r
index 24cffce..ca8e15b 100644
--- a/R/compute-collect.r
+++ b/R/compute-collect.r
@@ -1,32 +1,37 @@
-#' Compute a lazy tbl.
+#' Force computation of a database query
#'
-#' \code{compute} forces computation of lazy tbls, leaving data in the remote
-#' source. \code{collect} also forces computation, but will bring data back into
-#' an R data.frame (stored in a \code{\link{tbl_df}}). \code{collapse} doesn't
-#' force computation, but collapses a complex tbl into a form that additional
-#' restrictions can be placed on.
+#' `compute()` stores results in a remote temporary table.
+#' `collect()` retrieves data into a local tibble.
+#' `collapse()` is slightly different: it doesn't force computation, but
+#' instead forces generation of the SQL query. This is sometimes needed to work
+#' around bugs in dplyr's SQL generation.
#'
-#' @section Grouping:
+#' All functions preserve grouping and ordering.
#'
-#' \code{compute} and \code{collect} preserve grouping, \code{collapse} drops
-#' it.
-#'
-#' @param x a data tbl
-#' @param name name of temporary table on database.
-#' @param ... other arguments passed on to methods
+#' @param x A tbl
+#' @param name Name of temporary table on database.
+#' @param ... Other arguments passed on to methods
#' @inheritParams copy_to.src_sql
-#' @seealso \code{\link{copy_to}} which is the conceptual opposite: it
-#' takes a local data frame and makes it available to the remote source.
+#' @seealso [copy_to()], the opposite of `collect()`: it takes a local data
+#' frame and uploads it to the remote source.
#' @export
#' @examples
-#' \donttest{
-#' if (require("RSQLite") && has_lahman("sqlite")) {
-#' batting <- tbl(lahman_sqlite(), "Batting")
-#' remote <- select(filter(batting, yearID > 2010 && stint == 1), playerID:H)
-#' remote2 <- collapse(remote)
-#' cached <- compute(remote)
-#' local <- collect(remote)
-#' }
+#' if (require(dbplyr)) {
+#' mtcars2 <- src_memdb() %>%
+#' copy_to(mtcars, name = "mtcars2-cc", overwrite = TRUE)
+#'
+#' remote <- mtcars2 %>%
+#' filter(cyl == 8) %>%
+#' select(mpg:drat)
+#'
+#' # Compute query and save in remote table
+#' compute(remote)
+#'
+#' # Compute query and bring it back to this session
+#' collect(remote)
+#'
+#' # Creates a fresh query based on the generated SQL
+#' collapse(remote)
#' }
compute <- function(x, name = random_table_name(), ...) {
UseMethod("compute")
diff --git a/R/copy-to.r b/R/copy-to.r
index 95c8068..5c50628 100644
--- a/R/copy-to.r
+++ b/R/copy-to.r
@@ -1,4 +1,4 @@
-#' Copy a local data frame to a remote src.
+#' Copy a local data frame to a remote src
#'
#' This function uploads a local data frame into a remote data source, creating
#' the table definition as needed. Wherever possible, the new object will be
@@ -7,19 +7,30 @@
#' @param dest remote data source
#' @param df local data frame
#' @param name name for new remote table.
+#' @param overwrite If `TRUE`, will overwrite an existing table with
+#' name `name`. If `FALSE`, will throw an error if `name` already
+#' exists.
#' @param ... other parameters passed to methods.
-#' @return a \code{tbl} object in the remote source
+#' @seealso [collect()] for the opposite action; downloading remote data into
+#' a local tbl.
+#' @return a `tbl` object in the remote source
#' @export
-copy_to <- function(dest, df, name = deparse(substitute(df)), ...) {
+#' @examples
+#' \dontrun{
+#' iris2 <- dbplyr::src_memdb() %>% copy_to(iris, overwrite = TRUE)
+#' iris2
+#' }
+copy_to <- function(dest, df, name = deparse(substitute(df)),
+ overwrite = FALSE, ...) {
UseMethod("copy_to")
}
-#' Copy tables to same source, if necessary.
+#' Copy tables to same source, if necessary
#'
-#' @param x,y \code{y} will be copied to \code{x}, if neccessary.
-#' @param copy If \code{x} and \code{y} are not from the same data source,
-#' and \code{copy} is \code{TRUE}, then \code{y} will be copied into the
-#' same src as \code{x}. This allows you to join tables across srcs, but
+#' @param x,y `y` will be copied to `x`, if necessary.
+#' @param copy If `x` and `y` are not from the same data source,
+#' and `copy` is `TRUE`, then `y` will be copied into the
+#' same src as `x`. This allows you to join tables across srcs, but
#' it is a potentially expensive operation so you must opt into it.
#' @param ... Other arguments passed on to methods.
#' @export
@@ -27,8 +38,9 @@ auto_copy <- function(x, y, copy = FALSE, ...) {
if (same_src(x, y)) return(y)
if (!copy) {
- stop("x and y don't share the same src. Set copy = TRUE to copy y into ",
- "x's source (this may be time consuming).", call. = FALSE)
+ glubort(NULL, "`x` and `y` must share the same src, ",
+ "set `copy` = TRUE (may be slow)"
+ )
}
UseMethod("auto_copy")
diff --git a/R/count-tally.R b/R/count-tally.R
new file mode 100644
index 0000000..1798b92
--- /dev/null
+++ b/R/count-tally.R
@@ -0,0 +1,170 @@
+#' Count/tally observations by group
+#'
+#' @description
+#' `tally()` is a convenient wrapper for summarise that will either call
+#' [n()] or \code{\link{sum}(n)} depending on whether you're tallying
+#' for the first time, or re-tallying. `count()` is similar but calls
+#' [group_by()] before and [ungroup()] after.
+#'
+#' `add_tally()` adds a column "n" to a table based on the number
+#' of items within each existing group, while `add_count()` is a shortcut that
+#' does the grouping as well. These functions are to [tally()]
+#' and [count()] as [mutate()] is to [summarise()]:
+#' they add an additional column rather than collapsing each group.
+#'
+#' @note
+#' The column name in the returned data is usually `n`, even if you
+#' have supplied a weight.
+#'
+#' If the data already has a column named `n`, the output column
+#' will be called `nn`. If the table already has columns called `n` and `nn`
+#' then the column returned will be `nnn`, and so on.
+#'
+#' There is currently no way to control the output variable name - if you
+#' need to change the default, you'll have to write the [summarise()]
+#' yourself.
+#'
+#' @param x a [tbl()] to tally/count.
+#' @param ... Variables to group by.
+#' @param wt (Optional) If omitted, will count the number of rows. If
+#' specified, will perform a "weighted" tally by summing the
+#' (non-missing) values of variable `wt`. This argument is
+#' automatically [quoted][rlang::quo] and later
+#' [evaluated][rlang::eval_tidy] in the context of the data
+#' frame. It supports [unquoting][rlang::quasiquotation]. See
+#' `vignette("programming")` for an introduction to these concepts.
+#' @param sort if `TRUE` will sort output in descending order of `n`
+#' @return A tbl, grouped the same way as `x`.
+#' @export
+#' @examples
+#' # tally() is short-hand for summarise()
+#' mtcars %>% tally()
+#' # count() is a short-hand for group_by() + tally()
+#' mtcars %>% count(cyl)
+#'
+#' # add_tally() is short-hand for mutate()
+#' mtcars %>% add_tally()
+#' # add_count() is a short-hand for group_by() + add_tally()
+#' mtcars %>% add_count(cyl)
+#'
+#' # count and tally are designed so that you can call
+#' # them repeatedly, each time rolling up a level of detail
+#' species <- starwars %>% count(species, homeworld, sort = TRUE)
+#' species
+#' species %>% count(species, sort = TRUE)
+#'
+#' # add_count() is useful for groupwise filtering
+#' # e.g.: show only species that have a single member
+#' starwars %>%
+#' add_count(species) %>%
+#' filter(n == 1)
+tally <- function(x, wt, sort = FALSE) {
+ wt <- enquo(wt)
+
+ if (quo_is_missing(wt) && "n" %in% names(x)) {
+ inform("Using `n` as weighting variable")
+ wt <- quo(n)
+ }
+
+ if (quo_is_missing(wt) || quo_is_null(wt)) {
+ n <- quo(n())
+ } else {
+ n <- quo(sum(!! wt, na.rm = TRUE))
+ }
+
+ n_name <- n_name(tbl_vars(x))
+ out <- summarise(x, !! n_name := !! n)
+
+ if (sort) {
+ arrange(out, desc(!! sym(n_name)))
+ } else {
+ out
+ }
+}
+#' @rdname se-deprecated
+#' @inheritParams tally
+#' @export
+tally_ <- function(x, wt, sort = FALSE) {
+ wt <- compat_lazy(wt, caller_env())
+ tally(x, wt = !! wt, sort = sort)
+}
+
+n_name <- function(x) {
+ name <- "n"
+ while (name %in% x) {
+ name <- paste0(name, "n")
+ }
+
+ name
+
+}
+
+#' @export
+#' @rdname tally
+count <- function(x, ..., wt = NULL, sort = FALSE) {
+ groups <- group_vars(x)
+
+ x <- group_by(x, ..., add = TRUE)
+ x <- tally(x, wt = !! enquo(wt), sort = sort)
+ x <- group_by(x, !!! syms(groups), add = FALSE)
+ x
+}
+#' @export
+#' @rdname se-deprecated
+count_ <- function(x, vars, wt = NULL, sort = FALSE) {
+ vars <- compat_lazy_dots(vars, caller_env())
+ wt <- wt %||% quo(NULL)
+ wt <- compat_lazy(wt, caller_env())
+ count(x, !!! vars, wt = !! wt, sort = sort)
+}
+
+#' @rdname tally
+#' @export
+add_tally <- function(x, wt, sort = FALSE) {
+ wt <- enquo(wt)
+
+ if (quo_is_missing(wt) && "n" %in% names(x)) {
+ inform("Using `n` as weighting variable")
+ wt <- quo(n)
+ }
+
+ if (quo_is_missing(wt) || quo_is_null(wt)) {
+ n <- quo(n())
+ } else {
+ n <- quo(sum(!! wt, na.rm = TRUE))
+ }
+
+ n_name <- n_name(tbl_vars(x))
+ out <- mutate(x, !! n_name := !! n)
+
+ if (sort) {
+ out <- arrange(out, desc(!! sym(n_name)))
+ }
+
+ grouped_df(out, group_vars(x))
+}
+#' @rdname se-deprecated
+#' @export
+add_tally_ <- function(x, wt, sort = FALSE) {
+ wt <- compat_lazy(wt, caller_env())
+ add_tally(x, !! wt, sort = sort)
+}
+
+
+#' @rdname tally
+#' @export
+add_count <- function(x, ..., wt = NULL, sort = FALSE) {
+ g <- group_vars(x)
+ grouped <- group_by(x, ..., add = TRUE)
+
+ out <- add_tally(grouped, wt = !! enquo(wt), sort = sort)
+ grouped_df(out, g)
+}
+#' @rdname se-deprecated
+#' @export
+add_count_ <- function(x, vars, wt = NULL, sort = FALSE) {
+ vars <- compat_lazy_dots(vars, caller_env())
+ wt <- wt %||% quo(NULL)
+ wt <- compat_lazy(wt, caller_env())
+ add_count(x, !!! vars, wt = !! wt, sort = sort)
+}
diff --git a/R/data-bands.R b/R/data-bands.R
new file mode 100644
index 0000000..5694b2c
--- /dev/null
+++ b/R/data-bands.R
@@ -0,0 +1,23 @@
+#' Band membership
+#'
+#' These data sets describe band members of the Beatles and Rolling Stones. They
+#' are toy data sets that can be displayed in their entirety on a slide (e.g. to
+#' demonstrate a join).
+#'
+#' `band_instruments` and `band_instruments2` contain the same data but use
+#' different column names for the first column of the data set.
+#' `band_instruments` uses `name`, which matches the name of the key column of
+#' `band_members`; `band_instruments2` uses `artist`, which does not.
+#'
+#' @format Each is a tibble with two variables and three observations
+#' @examples
+#' band_members
+#' band_instruments
+#' band_instruments2
+"band_members"
+
+#' @rdname band_members
+"band_instruments"
+
+#' @rdname band_members
+"band_instruments2"
diff --git a/R/data-lahman.r b/R/data-lahman.r
deleted file mode 100644
index 1eb31a9..0000000
--- a/R/data-lahman.r
+++ /dev/null
@@ -1,110 +0,0 @@
-#' Cache and retrieve an \code{src_sqlite} of the Lahman baseball database.
-#'
-#' This creates an interesting database using data from the Lahman baseball
-#' data source, provided by Sean Lahman at
-#' \url{http://www.seanlahman.com/baseball-archive/statistics/}, and
-#' made easily available in R through the \pkg{Lahman} package by
-#' Michael Friendly, Dennis Murphy and Martin Monkman. See the documentation
-#' for that package for documentation of the inidividual tables.
-#'
-#' @param ... Other arguments passed to \code{src} on first
-#' load. For mysql and postgresql, the defaults assume you have a local
-#' server with \code{lahman} database already created.
-#' For \code{lahman_srcs}, character vector of names giving srcs to generate.
-#' @param quiet if \code{TRUE}, suppress messages about databases failing to
-#' connect.
-#' @param type src type.
-#' @keywords internal
-#' @examples
-#' # Connect to a local sqlite database, if already created
-#' \donttest{
-#' if (has_lahman("sqlite")) {
-#' lahman_sqlite()
-#' batting <- tbl(lahman_sqlite(), "Batting")
-#' batting
-#' }
-#'
-#' # Connect to a local postgres database with lahman database, if available
-#' if (has_lahman("postgres")) {
-#' lahman_postgres()
-#' batting <- tbl(lahman_postgres(), "Batting")
-#' }
-#' }
-#' @name lahman
-NULL
-
-#' @export
-#' @rdname lahman
-lahman_sqlite <- function(path = NULL) {
- path <- db_location(path, "lahman.sqlite")
- copy_lahman(src_sqlite(path = path, create = TRUE))
-}
-
-#' @export
-#' @rdname lahman
-lahman_postgres <- function(dbname = "lahman", ...) {
- copy_lahman(src_postgres(dbname, ...))
-}
-
-#' @export
-#' @rdname lahman
-lahman_mysql <- function(dbname = "lahman", ...) {
- copy_lahman(src_mysql(dbname, ...))
-}
-
-#' @export
-#' @rdname lahman
-lahman_df <- function() {
- src_df("Lahman")
-}
-
-#' @export
-#' @rdname lahman
-lahman_dt <- function() {
- dtplyr::src_dt("Lahman")
-}
-
-#' @rdname lahman
-#' @export
-copy_lahman <- function(src, ...) {
- # Create missing tables
- tables <- setdiff(lahman_tables(), src_tbls(src))
- for(table in tables) {
- df <- getExportedValue("Lahman", table)
- message("Creating table: ", table)
-
- ids <- as.list(names(df)[grepl("ID$", names(df))])
- copy_to(src, df, table, indexes = ids, temporary = FALSE)
- }
-
- src
-}
-# Get list of all non-label data frames in package
-lahman_tables <- function() {
- tables <- utils::data(package = "Lahman")$results[, 3]
- tables[!grepl("Labels", tables)]
-}
-
-#' @rdname lahman
-#' @export
-has_lahman <- function(type, ...) {
- if (!requireNamespace("Lahman", quietly = TRUE)) return(FALSE)
- if (missing(type)) return(TRUE)
-
- succeeds(lahman(type, ...), quiet = TRUE)
-}
-
-#' @rdname lahman
-#' @export
-lahman_srcs <- function(..., quiet = NULL) {
- load_srcs(lahman, c(...), quiet = quiet)
-}
-
-lahman <- function(type, ...) {
- if (missing(type)) {
- src_df("Lahman")
- } else {
- f <- match.fun(paste0("lahman_", type))
- f(...)
- }
-}
diff --git a/R/data-nasa.r b/R/data-nasa.r
index 2f8953c..82eb88c 100644
--- a/R/data-nasa.r
+++ b/R/data-nasa.r
@@ -12,23 +12,23 @@
#' @section Dimensions:
#'
#' \itemize{
-#' \item \code{lat}, \code{long}: latitude and longitude
-#' \item \code{year}, \code{month}: month and year
+#' \item `lat`, `long`: latitude and longitude
+#' \item `year`, `month`: month and year
#' }
#'
#' @section Measures:
#'
#' \itemize{
-#' \item \code{cloudlow}, \code{cloudmed}, \code{cloudhigh}: cloud cover
+#' \item `cloudlow`, `cloudmed`, `cloudhigh`: cloud cover
#' at three heights
-#' \item \code{ozone}
-#' \item \code{surftemp} and \code{temperature}
-#' \item \code{pressure}
+#' \item `ozone`
+#' \item `surftemp` and `temperature`
+#' \item `pressure`
#' }
#' @docType data
#' @name nasa
#' @usage nasa
-#' @format A \code{\link{tbl_cube}} with 41,472 observations.
+#' @format A [tbl_cube] with 41,472 observations.
#' @examples
#' nasa
NULL
diff --git a/R/data-nycflights13.r b/R/data-nycflights13.r
deleted file mode 100644
index 4e46fd7..0000000
--- a/R/data-nycflights13.r
+++ /dev/null
@@ -1,72 +0,0 @@
-#' Database versions of the nycflights13 data
-#'
-#' These functions cache the data from the \code{nycflights13} database in
-#' a local database, for use in examples and vignettes. Indexes are created
-#' to making joining tables on natural keys efficient.
-#'
-#' @keywords internal
-#' @name nycflights13
-NULL
-
-#' @export
-#' @rdname nycflights13
-#' @param path location of sqlite database file
-nycflights13_sqlite <- function(path = NULL) {
- cache_computation("nycflights_sqlite", {
- path <- db_location(path, "nycflights13.sqlite")
- message("Caching nycflights db at ", path)
- src <- src_sqlite(path, create = TRUE)
- copy_nycflights13(src)
- })
-}
-
-#' @export
-#' @rdname nycflights13
-#' @param dbname,... Arguments passed on to \code{\link{src_postgres}}
-nycflights13_postgres <- function(dbname = "nycflights13", ...) {
- cache_computation("nycflights_postgres", {
- message("Caching nycflights db in postgresql db ", dbname)
- copy_nycflights13(src_postgres(dbname, ...))
- })
-}
-
-#' @rdname nycflights13
-#' @export
-has_nycflights13 <- function(type = c("sqlite", "postgresql"), ...) {
- if (!requireNamespace("nycflights13", quietly = TRUE)) return(FALSE)
-
- type <- match.arg(type)
-
- succeeds(switch(type,
- sqlite = nycflights13_sqlite(...), quiet = TRUE,
- postgres = nycflights13_postgres(...), quiet = TRUE
- ))
-}
-
-
-#' @export
-#' @rdname nycflights13
-copy_nycflights13 <- function(src, ...) {
- all <- utils::data(package = "nycflights13")$results[, 3]
- unique_index <- list(
- airlines = list("carrier"),
- planes = list("tailnum")
- )
- index <- list(
- airports = list("faa"),
- flights = list(c("year", "month", "day"), "carrier", "tailnum", "origin", "dest"),
- weather = list(c("year", "month", "day"), "origin")
- )
-
- tables <- setdiff(all, src_tbls(src))
-
- # Create missing tables
- for(table in tables) {
- df <- getExportedValue("nycflights13", table)
- message("Creating table: ", table)
-
- copy_to(src, df, table, unique_indexes = unique_index[[table]],
- indexes = index[[table]], temporary = FALSE)
- }
- src
-}
diff --git a/R/data-starwars.R b/R/data-starwars.R
new file mode 100644
index 0000000..ed7d9ea
--- /dev/null
+++ b/R/data-starwars.R
@@ -0,0 +1,21 @@
+#' Starwars characters
+#'
+#' This data comes from SWAPI, the Star Wars API, <http://swapi.co/>
+#'
+#' @format A tibble with 87 rows and 13 variables:
+#' \describe{
+#' \item{name}{Name of the character}
+#' \item{height}{Height (cm)}
+#' \item{mass}{Weight (kg)}
+#' \item{hair_color,skin_color,eye_color}{Hair, skin, and eye colors}
+#' \item{birth_year}{Year born (BBY = Before Battle of Yavin)}
+#' \item{gender}{male, female, hermaphrodite, or none.}
+#' \item{homeworld}{Name of homeworld}
+#' \item{species}{Name of species}
+#' \item{films}{List of films the character appeared in}
+#' \item{vehicles}{List of vehicles the character has piloted}
+#' \item{starships}{List of starships the character has piloted}
+#' }
+#' @examples
+#' starwars
+"starwars"
diff --git a/R/data-storms.R b/R/data-storms.R
new file mode 100644
index 0000000..b51f0d6
--- /dev/null
+++ b/R/data-storms.R
@@ -0,0 +1,25 @@
+#' Storm tracks data
+#'
+#' This data is a subset of the NOAA Atlantic hurricane database best track
+#' data, \url{http://www.nhc.noaa.gov/data/#hurdat}. The data includes the
+#' positions and attributes of 198 tropical storms, measured every six hours
+#' during the lifetime of a storm.
+#'
+#' @format A tibble with 10,010 observations and 13 variables:
+#' \describe{
+#' \item{name}{Storm Name}
+#' \item{year,month,day}{Date of report}
+#' \item{hour}{Hour of report (in UTC)}
+#' \item{lat,long}{Location of storm center}
+#' \item{status}{Storm classification (Tropical Depression, Tropical Storm,
+#' or Hurricane)}
+#' \item{category}{Saffir-Simpson storm category (estimated from wind speed.
+#' -1 = Tropical Depression, 0 = Tropical Storm)}
+#' \item{wind}{storm's maximum sustained wind speed (in knots)}
+#' \item{pressure}{Air pressure at the storm's center (in millibars)}
+#' \item{ts_diameter}{Diameter of the area experiencing tropical storm strength winds (34 knots or above)}
+#' \item{hu_diameter}{Diameter of the area experiencing hurricane strength winds (64 knots or above)}
+#' }
+#' @examples
+#' storms
+"storms"
diff --git a/R/data-temp.r b/R/data-temp.r
deleted file mode 100644
index ae5efc5..0000000
--- a/R/data-temp.r
+++ /dev/null
@@ -1,72 +0,0 @@
-#' Infrastructure for testing dplyr
-#'
-#' Register testing sources, then use \code{test_load} to load an existing
-#' data frame into each source. To create a new table in each source,
-#' use \code{test_frame}.
-#'
-#' @keywords internal
-#' @examples
-#' \dontrun{
-#' test_register_src("df", src_df(env = new.env()))
-#' test_register_src("sqlite", src_sqlite(":memory:", create = TRUE))
-#'
-#' test_frame(x = 1:3, y = 3:1)
-#' test_load(mtcars)
-#' }
-#' @name testing
-NULL
-
-
-#' @export
-#' @rdname testing
-test_register_src <- function(name, src) {
- message("Registering testing src: ", name, " ", appendLF = FALSE)
- tryCatch(
- {
- test_srcs$add(name, src)
- message("OK")
- },
- error = function(e) message(conditionMessage(e))
- )
-}
-
-#' @export
-#' @rdname testing
-test_load <- function(df, name = random_table_name(), srcs = test_srcs$get(),
- ignore = character()) {
- stopifnot(is.data.frame(df))
- stopifnot(is.character(ignore))
-
- srcs <- srcs[setdiff(names(srcs), ignore)]
- lapply(srcs, copy_to, df, name = name)
-}
-
-#' @export
-#' @rdname testing
-test_frame <- function(..., srcs = test_srcs$get(), ignore = character()) {
- df <- data_frame(...)
- test_load(df, srcs = srcs, ignore = ignore)
-}
-
-# Manage cache of testing srcs
-test_srcs <- local({
- e <- new.env(parent = emptyenv())
- e$srcs <- list()
-
- list(
- get = function() e$srcs,
-
- has = function(x) x %in% names(e$srcs),
-
- add = function(name, src) {
- stopifnot(is.src(src))
- e$srcs[[name]] <- src
- },
-
- set = function(...) {
- old <- e$srcs
- e$srcs <- list(...)
- invisible(old)
- }
- )
-})
diff --git a/R/data.r b/R/data.r
deleted file mode 100644
index afc4f32..0000000
--- a/R/data.r
+++ /dev/null
@@ -1,65 +0,0 @@
-# Environment for caching connections etc
-cache <- new.env(parent = emptyenv())
-
-is_cached <- function(name) exists(name, envir = cache)
-set_cache <- function(name, value) {
-# message("Setting ", name, " in cache")
- assign(name, value, envir = cache)
- value
-}
-get_cache <- function(name) {
-# message("Getting ", name, " from cache")
- get(name, envir = cache)
-}
-
-cache_computation <- function(name, computation) {
- if (is_cached(name)) {
- get_cache(name)
- } else {
- res <- force(computation)
- set_cache(name, res)
- res
- }
-}
-
-load_srcs <- function(f, src_names, quiet = NULL) {
- if (is.null(quiet)) {
- quiet <- !identical(Sys.getenv("NOT_CRAN"), "true")
- }
-
-
- srcs <- lapply(src_names, function(x) {
- out <- NULL
- try(out <- f(x), silent = TRUE)
- if (is.null(out) && !quiet) {
- message("Could not instantiate ", x, " src")
- }
- out
- })
-
- compact(setNames(srcs, src_names))
-}
-
-
-db_location <- function(path = NULL, filename) {
- if (!is.null(path)) {
- # Check that path is a directory and is writeable
- if (!file.exists(path) || !file.info(path)$isdir) {
- stop(path, " is not a directory", call. = FALSE)
- }
- if (!is_writeable(path)) stop("Can not write to ", path, call. = FALSE)
- return(file.path(path, filename))
- }
-
- pkg <- file.path(system.file("db", package = "dplyr"))
- if (is_writeable(pkg)) return(file.path(pkg, filename))
-
- tmp <- tempdir()
- if (is_writeable(tmp)) return(file.path(tmp, filename))
-
- stop("Could not find writeable location to cache db", call. = FALSE)
-}
-
-is_writeable <- function(x) {
- unname(file.access(x, 2) == 0)
-}
diff --git a/R/dataframe.R b/R/dataframe.R
index a0b5e12..bed81c8 100644
--- a/R/dataframe.R
+++ b/R/dataframe.R
@@ -1,28 +1,12 @@
# Grouping methods ------------------------------------------------------------
-#' Convert to a data frame
-#'
-#' Functions that convert the input to a \code{data_frame}.
-#'
-#' @details For a grouped data frame, the \code{\link[tibble]{as_data_frame}}
-#' S3 generic simply removes the grouping.
-#'
-#' @inheritParams tibble::as_data_frame
-#' @seealso \code{\link[tibble]{as_data_frame}}
-#' @name grouped_df
-#' @export
-as_data_frame.grouped_df <- function(x, ...) {
- x <- ungroup(x)
- class(x) <- c("tbl_df", "tbl", "data.frame")
- x
-}
-
#' Convert row names to an explicit variable.
#'
-#' Deprecated, use \code{\link[tibble]{rownames_to_column}} instead.
+#' Deprecated, use [tibble::rownames_to_column()] instead.
#'
#' @param df Input data frame with rownames.
#' @param var Name of variable to use
+#' @keywords internal
#' @export
#' @examples
#' mtcars %>% tbl_df()
@@ -44,9 +28,14 @@ add_rownames <- function(df, var = "rowname") {
# Grouping methods ------------------------------------------------------------
#' @export
-group_by_.data.frame <- function(.data, ..., .dots, add = FALSE) {
- groups <- group_by_prepare(.data, ..., .dots = .dots, add = add)
- grouped_df(groups$data, groups$groups)
+group_by.data.frame <- function(.data, ..., add = FALSE) {
+ groups <- group_by_prepare(.data, ..., add = add)
+ grouped_df(groups$data, groups$group_names)
+}
+#' @export
+group_by_.data.frame <- function(.data, ..., .dots = list(), add = FALSE) {
+ dots <- compat_lazy_dots(.dots, caller_env(), ...)
+ group_by(.data, !!! dots, add = add)
}
#' @export
@@ -67,42 +56,78 @@ n_groups.data.frame <- function(x) 1L
# is just a convenience layer, I didn't bother. They should still be fast.
#' @export
-filter_.data.frame <- function(.data, ..., .dots) {
- dots <- lazyeval::all_dots(.dots, ...)
- as.data.frame(filter_(tbl_df(.data), .dots = dots))
+filter.data.frame <- function(.data, ...) {
+ as.data.frame(filter(tbl_df(.data), ...))
}
#' @export
-slice_.data.frame <- function(.data, ..., .dots) {
- dots <- lazyeval::all_dots(.dots, ..., all_named = TRUE)
- as.data.frame(slice_(tbl_df(.data), .dots = dots))
+filter_.data.frame <- function(.data, ..., .dots = list()) {
+ dots <- compat_lazy_dots(.dots, caller_env(), ...)
+ filter(.data, !!! dots)
}
+
#' @export
-summarise_.data.frame <- function(.data, ..., .dots) {
- dots <- lazyeval::all_dots(.dots, ..., all_named = TRUE)
- as.data.frame(summarise_(tbl_df(.data), .dots = dots))
+slice.data.frame <- function(.data, ...) {
+ dots <- named_quos(...)
+ slice_impl(.data, dots)
}
#' @export
-mutate_.data.frame <- function(.data, ..., .dots) {
- dots <- lazyeval::all_dots(.dots, ..., all_named = TRUE)
- as.data.frame(mutate_(tbl_df(.data), .dots = dots))
+slice_.data.frame <- function(.data, ..., .dots = list()) {
+ dots <- compat_lazy_dots(.dots, caller_env(), ...)
+ slice_impl(.data, dots)
}
+
#' @export
-arrange_.data.frame <- function(.data, ..., .dots) {
- dots <- lazyeval::all_dots(.dots, ..., all_named = TRUE)
- as.data.frame(arrange_(tbl_df(.data), .dots = dots))
+summarise.data.frame <- function(.data, ...) {
+ as.data.frame(summarise(tbl_df(.data), ...))
}
#' @export
-select_.data.frame <- function(.data, ..., .dots) {
- dots <- lazyeval::all_dots(.dots, ...)
- vars <- select_vars_(names(.data), dots)
+summarise_.data.frame <- function(.data, ..., .dots = list()) {
+ dots <- compat_lazy_dots(.dots, caller_env(), ...)
+ summarise(.data, !!! dots)
+}
+
+#' @export
+mutate.data.frame <- function(.data, ...) {
+ as.data.frame(mutate(tbl_df(.data), ...))
+}
+#' @export
+mutate_.data.frame <- function(.data, ..., .dots = list()) {
+ dots <- compat_lazy_dots(.dots, caller_env(), ...)
+ mutate(.data, !!! dots)
+}
+
+#' @export
+arrange.data.frame <- function(.data, ...) {
+ as.data.frame(arrange(tbl_df(.data), ...))
+}
+#' @export
+arrange_.data.frame <- function(.data, ..., .dots = list()) {
+ dots <- compat_lazy_dots(.dots, caller_env(), ...)
+ arrange(.data, !!! dots)
+}
+
+#' @export
+select.data.frame <- function(.data, ...) {
+ # Pass via splicing to avoid matching select_vars() arguments
+ vars <- select_vars(names(.data), !!! quos(...))
select_impl(.data, vars)
}
#' @export
-rename_.data.frame <- function(.data, ..., .dots) {
- dots <- lazyeval::all_dots(.dots, ...)
- vars <- rename_vars_(names(.data), dots)
+select_.data.frame <- function(.data, ..., .dots = list()) {
+ dots <- compat_lazy_dots(.dots, caller_env(), ...)
+ select(.data, !!! dots)
+}
+
+#' @export
+rename.data.frame <- function(.data, ...) {
+ vars <- rename_vars(names(.data), !!! quos(...))
select_impl(.data, vars)
}
+#' @export
+rename_.data.frame <- function(.data, ..., .dots = list()) {
+ dots <- compat_lazy_dots(.dots, caller_env(), ...)
+ rename(.data, !!! dots)
+}
# Joins ------------------------------------------------------------------------
@@ -155,73 +180,86 @@ setdiff.data.frame <- function(x, y, ...) setdiff_data_frame(x, y)
setequal.data.frame <- function(x, y, ...) equal_data_frame(x, y)
#' @export
-distinct_.data.frame <- function(.data, ..., .dots, .keep_all = FALSE) {
- dist <- distinct_vars(.data, ..., .dots = .dots, .keep_all = .keep_all)
+distinct.data.frame <- function(.data, ..., .keep_all = FALSE) {
+ dist <- distinct_vars(.data, named_quos(...), .keep_all = .keep_all)
distinct_impl(dist$data, dist$vars, dist$keep)
}
+#' @export
+distinct_.data.frame <- function(.data, ..., .dots = list(), .keep_all = FALSE) {
+ dots <- compat_lazy_dots(.dots, caller_env(), ...)
+ distinct(.data, !!! dots, .keep_all = .keep_all)
+}
# Do ---------------------------------------------------------------------------
#' @export
-do_.data.frame <- function(.data, ..., .dots) {
- args <- lazyeval::all_dots(.dots, ...)
+do.data.frame <- function(.data, ...) {
+ args <- quos(...)
named <- named_args(args)
- data <- list(. = .data)
+ # Create custom dynamic scope with `.` pronoun
+ # FIXME: Pass without splicing once child_env() calls env_bind()
+ # with explicit arguments
+ overscope <- child_env(NULL, !!! list(. = .data, .data = .data))
if (!named) {
- env <- new.env(parent = args[[1]]$env)
- env$. <- .data
-
- out <- lazyeval::lazy_eval(args[[1]], data)
- if (!is.data.frame(out)) {
- stop("Result must be a data frame", call. = FALSE)
+ out <- eval_tidy_(args[[1]], overscope)
+ if (!inherits(out, "data.frame")) {
+ bad("Result must be a data frame, not {fmt_classes(out)}")
}
} else {
- out <- lapply(args, function(arg) {
- list(lazyeval::lazy_eval(arg, data))
- })
+ out <- map(args, function(arg) list(eval_tidy_(arg, overscope)))
names(out) <- names(args)
- attr(out, "row.names") <- .set_row_names(1L)
- # Use tbl_df to ensure safe printing of list columns
- class(out) <- c("tbl_df", "data.frame")
+ out <- tibble::as_tibble(out, validate = FALSE)
}
out
}
+#' @export
+do_.data.frame <- function(.data, ..., .dots = list()) {
+ dots <- compat_lazy_dots(.dots, caller_env(), ...)
+ do(.data, !!! dots)
+}
# Random samples ---------------------------------------------------------------
#' @export
-sample_n.data.frame <- function(tbl, size, replace = FALSE, weight = NULL,
- .env = parent.frame()) {
- if (!missing(weight)) {
- weight <- eval(substitute(weight), tbl, .env)
+sample_n.data.frame <- function(tbl, size, replace = FALSE,
+ weight = NULL, .env = NULL) {
+ if (!is_null(.env)) {
+ inform("`.env` is deprecated and no longer has any effect")
}
- sample_n_basic(tbl, size, replace = replace, weight = weight)
+ weight <- eval_tidy(enquo(weight), tbl)
+ sample_n_basic(tbl, size, FALSE, replace = replace, weight = weight)
}
#' @export
-sample_frac.data.frame <- function(tbl, size = 1, replace = FALSE, weight = NULL,
- .env = parent.frame()) {
-
- if (!missing(weight)) {
- weight <- eval(substitute(weight), tbl, .env)
+sample_frac.data.frame <- function(tbl, size = 1, replace = FALSE,
+ weight = NULL, .env = NULL) {
+ if (!is_null(.env)) {
+ inform("`.env` is deprecated and no longer has any effect")
}
- sample_n_basic(tbl, round(size * nrow(tbl)), replace = replace, weight = weight)
+ weight <- eval_tidy(enquo(weight), tbl)
+ sample_n_basic(tbl, size, TRUE, replace = replace, weight = weight)
}
-sample_n_basic <- function(tbl, size, replace = FALSE, weight = NULL) {
+sample_n_basic <- function(tbl, size, frac, replace = FALSE, weight = NULL) {
n <- nrow(tbl)
weight <- check_weight(weight, n)
assert_that(is.numeric(size), length(size) == 1, size >= 0)
- check_size(size, n, replace)
+
+ if (frac) {
+ check_frac(size, replace)
+ size <- round(size * n)
+ } else {
+ check_size(size, n, replace)
+ }
idx <- sample.int(n, size, replace = replace, prob = weight)
tbl[idx, , drop = FALSE]
diff --git a/R/dbi-s3.r b/R/dbi-s3.r
deleted file mode 100644
index 40287c9..0000000
--- a/R/dbi-s3.r
+++ /dev/null
@@ -1,288 +0,0 @@
-#' @import DBI
-NULL
-
-
-#' Source generics.
-#'
-#' These generics retrieve metadata for a given src.
-#'
-#' @keywords internal
-#' @name backend_src
-NULL
-
-#' @name backend_src
-#' @export
-src_desc <- function(x) UseMethod("src_desc")
-
-#' @name backend_src
-#' @export
-sql_translate_env <- function(con) UseMethod("sql_translate_env")
-
-#' @name backend_src
-#' @export
-sql_translate_env.NULL <- function(con) {
- sql_variant(
- base_scalar,
- base_agg,
- base_win
- )
-}
-
-#' Database generics.
-#'
-#' These generics execute actions on the database. Most generics have a method
-#' for \code{DBIConnection} which typically just call the standard DBI S4
-#' method.
-#'
-#' Note, a few backend methods do not call the standard DBI S4 methods including
-#' \itemize{
-#' \item \code{db_data_type}: Calls DBI's \code{dbDataType} for every field
-#' (e.g. data frame column) and returns a vector of corresponding SQL data
-#' types
-#' \item \code{db_save_query}: Builds and executes \code{CREATE [TEMPORARY]
-#' TABLE <table> ...} SQL command.
-#' \item \code{db_create_table}: Builds and executes \code{CREATE [TEMPORARY]
-#' TABLE <table> ...} SQL command.
-#' \item \code{db_create_index}: Builds and executes \code{CREATE INDEX <name>
-#' ON <table>} SQL command.
-#' \item \code{db_drop_table}: Builds and executes \code{DROP TABLE [IF EXISTS]
-#' <table>} SQL command.
-#' \item \code{db_analyze}: Builds and executes \code{ANALYZE <table>} SQL
-#' command.
-#' \item \code{db_insert_into} and \code{db_explain}: do not have methods
-#' calling corresponding DBI methods. The latter because no underlying DBI S4
-#' method exists and the former because calls to the corresponding DBI S4
-#' method (\code{dbWriteTable}) need to be able to specify an appropriate
-#' combination of values for non-standard \code{append} and \code{overwrite}
-#' arguments.
-#' }
-#'
-#' Currently, \code{copy_to} is the only user of \code{db_begin()}, \code{db_commit()},
-#' \code{db_rollback()}, \code{db_create_table()}, \code{db_insert_into()},
-#' \code{db_create_indexes()}, \code{db_drop_table()} and
-#' \code{db_analyze()}. If you find yourself overriding many of these
-#' functions it may suggest that you should just override \code{\link{copy_to}}
-#' instead.
-#'
-#' @return Usually a logical value indicating success. Most failures should generate
-#' an error. However, \code{db_has_table()} should return \code{NA} if
-#' temporary tables cannot be listed with \code{dbListTables} (due to backend
-#' API limitations for example). As a result, you methods will rely on the
-#' backend to throw an error if a table exists when it shouldn't.
-#' @name backend_db
-#' @param con A database connection.
-#' @keywords internal
-NULL
-
-#' @name backend_db
-#' @export
-db_list_tables <- function(con) UseMethod("db_list_tables")
-#' @export
-db_list_tables.DBIConnection <- function(con) dbListTables(con)
-
-#' @name backend_db
-#' @export
-#' @param table A string, the table name.
-db_has_table <- function(con, table) UseMethod("db_has_table")
-#' @export
-db_has_table.DBIConnection <- function(con, table) dbExistsTable(con, table)
-
-#' @name backend_db
-#' @export
-#' @param fields A list of fields, as in a data frame.
-db_data_type <- function(con, fields) UseMethod("db_data_type")
-#' @export
-db_data_type.DBIConnection <- function(con, fields) {
- vapply(fields, dbDataType, dbObj = con, FUN.VALUE = character(1))
-}
-
-#' @name backend_db
-#' @export
-db_save_query <- function(con, sql, name, temporary = TRUE, ...) {
- UseMethod("db_save_query")
-}
-
-#' @export
-db_save_query.DBIConnection <- function(con, sql, name, temporary = TRUE,
- ...) {
- tt_sql <- build_sql("CREATE ", if (temporary) sql("TEMPORARY "),
- "TABLE ", ident(name), " AS ", sql, con = con)
- dbGetQuery(con, tt_sql)
- name
-}
-
-#' @name backend_db
-#' @export
-db_begin <- function(con, ...) UseMethod("db_begin")
-#' @export
-db_begin.DBIConnection <- function(con, ...) {
- dbBegin(con)
-}
-
-#' @name backend_db
-#' @export
-db_commit <- function(con, ...) UseMethod("db_commit")
-#' @export
-db_commit.DBIConnection <- function(con, ...) dbCommit(con)
-
-#' @name backend_db
-#' @export
-db_rollback <- function(con, ...) UseMethod("db_rollback")
-#' @export
-db_rollback.DBIConnection <- function(con, ...) dbRollback(con)
-
-#' @name backend_db
-#' @export
-db_create_table <- function(con, table, types, temporary = FALSE, ...) {
- UseMethod("db_create_table")
-}
-#' @export
-db_create_table.DBIConnection <- function(con, table, types,
- temporary = FALSE, ...) {
- assert_that(is.string(table), is.character(types))
-
- field_names <- escape(ident(names(types)), collapse = NULL, con = con)
- fields <- sql_vector(paste0(field_names, " ", types), parens = TRUE,
- collapse = ", ", con = con)
- sql <- build_sql("CREATE ", if (temporary) sql("TEMPORARY "),
- "TABLE ", ident(table), " ", fields, con = con)
-
- dbGetQuery(con, sql)
-}
-
-#' @name backend_db
-#' @export
-db_insert_into <- function(con, table, values, ...) {
- UseMethod("db_insert_into")
-}
-
-#' @name backend_db
-#' @export
-db_create_indexes <- function(con, table, indexes = NULL, unique = FALSE, ...) {
- UseMethod("db_create_indexes")
-}
-
-#' @export
-db_create_indexes.DBIConnection <- function(con, table, indexes = NULL,
- unique = FALSE, ...) {
- if (is.null(indexes)) return()
- assert_that(is.list(indexes))
-
- for(index in indexes) {
- db_create_index(con, table, index, unique = unique, ...)
- }
-}
-
-#' @name backend_db
-#' @export
-db_create_index <- function(con, table, columns, name = NULL, unique = FALSE,
- ...) {
- UseMethod("db_create_index")
-}
-
-#' @export
-db_create_index.DBIConnection <- function(con, table, columns, name = NULL,
- unique = FALSE, ...) {
- assert_that(is.string(table), is.character(columns))
-
- name <- name %||% paste0(c(table, columns), collapse = "_")
- fields <- escape(ident(columns), parens = TRUE, con = con)
- sql <- build_sql(
- "CREATE ", if (unique) sql("UNIQUE "), "INDEX ", ident(name),
- " ON ", ident(table), " ", fields,
- con = con)
-
- dbGetQuery(con, sql)
-}
-
-#' @name backend_db
-#' @export
-db_drop_table <- function(con, table, force = FALSE, ...) {
- UseMethod("db_drop_table")
-}
-#' @export
-db_drop_table.DBIConnection <- function(con, table, force = FALSE, ...) {
- sql <- build_sql("DROP TABLE ", if (force) sql("IF EXISTS "), ident(table),
- con = con)
- dbGetQuery(con, sql)
-}
-
-#' @name backend_db
-#' @export
-db_analyze <- function(con, table, ...) UseMethod("db_analyze")
-#' @export
-db_analyze.DBIConnection <- function(con, table, ...) {
- sql <- build_sql("ANALYZE ", ident(table), con = con)
- dbGetQuery(con, sql)
-}
-
-#' @export
-#' @rdname backend_db
-db_explain <- function(con, sql, ...) {
- UseMethod("db_explain")
-}
-
-#' @export
-db_explain.DBIConnection <- function(con, sql, ...) {
- exsql <- build_sql("EXPLAIN ", sql, con = con)
- expl <- dbGetQuery(con, exsql)
- out <- utils::capture.output(print(expl))
-
- paste(out, collapse = "\n")
-}
-
-#' @rdname backend_db
-#' @export
-db_query_fields <- function(con, sql, ...) {
- UseMethod("db_query_fields")
-}
-#' @export
-db_query_fields.DBIConnection <- function(con, sql, ...) {
- sql <- sql_select(con, sql("*"), sql_subquery(con, sql), where = sql("0 = 1"))
- qry <- dbSendQuery(con, sql)
- on.exit(dbClearResult(qry))
-
- res <- dbFetch(qry, 0)
- names(res)
-}
-
-#' @rdname backend_db
-#' @export
-db_query_rows <- function(con, sql, ...) {
- UseMethod("db_query_rows")
-}
-#' @export
-db_query_rows.DBIConnection <- function(con, sql, ...) {
- from <- sql_subquery(con, sql, "master")
- rows <- build_sql("SELECT count(*) FROM ", from, con = con)
-
- as.integer(dbGetQuery(con, rows)[[1]])
-}
-
-# Utility functions ------------------------------------------------------------
-
-random_table_name <- function(n = 10) {
- paste0(sample(letters, n, replace = TRUE), collapse = "")
-}
-
-# Creates an environment that disconnects the database when it's
-# garbage collected
-db_disconnector <- function(con, name, quiet = FALSE) {
- reg.finalizer(environment(), function(...) {
- if (!quiet) {
- message("Auto-disconnecting ", name, " connection ",
- "(", paste(con at Id, collapse = ", "), ")")
- }
- dbDisconnect(con)
- })
- environment()
-}
-
-res_warn_incomplete <- function(res, hint = "n = -1") {
- if (dbHasCompleted(res)) return()
-
- rows <- big_mark(dbGetRowCount(res))
- warning("Only first ", rows, " results retrieved. Use ", hint, " to retrieve all.",
- call. = FALSE)
-}
-
diff --git a/R/dbplyr.R b/R/dbplyr.R
new file mode 100644
index 0000000..c40fa64
--- /dev/null
+++ b/R/dbplyr.R
@@ -0,0 +1,184 @@
+#' Database and SQL generics.
+#'
+#' The `sql_` generics are used to build the different types of SQL queries.
+#' The default implementations in dbplyr generates ANSI 92 compliant SQL.
+#' The `db_` generics execute actions on the database. The default
+#' implementations in dbplyr typically just call the standard DBI S4
+#' method.
+#'
+#' A few backend methods do not call the standard DBI S4 methods including
+#'
+#' * `db_data_type()`: Calls [DBI::dbDataType()] for every field
+#' (e.g. data frame column) and returns a vector of corresponding SQL data
+#' types
+#'
+#' * `db_save_query()`: Builds and executes a
+#' `CREATE [TEMPORARY] TABLE <table> ...` SQL command.
+#'
+#' * `db_create_index()`: Builds and executes a
+#' `CREATE INDEX <name> ON <table>` SQL command.
+#'
+#' * `db_drop_table()`: Builds and executes a
+#' `DROP TABLE [IF EXISTS] <table>` SQL command.
+#'
+#' * `db_analyze()`: Builds and executes an
+#' `ANALYZE <table>` SQL command.
+#'
+#' Currently, [copy_to()] is the only user of `db_begin()`, `db_commit()`,
+#' `db_rollback()`, `db_write_table()`, `db_create_indexes()`, `db_drop_table()` and
+#' `db_analyze()`. If you find yourself overriding many of these
+#' functions it may suggest that you should just override `copy_to()`
+#' instead.
+#'
+#' `db_create_table()` and `db_insert_into()` have been deprecated
+#' in favour of `db_write_table()`.
+#'
+#' @return Usually a logical value indicating success. Most failures should generate
+#' an error. However, `db_has_table()` should return `NA` if
+#' temporary tables cannot be listed with [DBI::dbListTables()] (due to backend
+#' API limitations for example). As a result, you methods will rely on the
+#' backend to throw an error if a table exists when it shouldn't.
+#' @name backend_dbplyr
+#' @param con A database connection.
+#' @keywords internal
+NULL
+
+#' @name backend_dbplyr
+#' @export
+db_desc <- function(x) UseMethod("db_desc")
+
+#' @name backend_dbplyr
+#' @export
+sql_translate_env <- function(con) UseMethod("sql_translate_env")
+
+#' @name backend_dbplyr
+#' @export
+db_list_tables <- function(con) UseMethod("db_list_tables")
+
+#' @name backend_dbplyr
+#' @export
+#' @param table A string, the table name.
+db_has_table <- function(con, table) UseMethod("db_has_table")
+
+#' @name backend_dbplyr
+#' @export
+#' @param fields A list of fields, as in a data frame.
+db_data_type <- function(con, fields) UseMethod("db_data_type")
+#' @export
+
+#' @name backend_dbplyr
+#' @export
+db_save_query <- function(con, sql, name, temporary = TRUE, ...) {
+ UseMethod("db_save_query")
+}
+
+#' @name backend_dbplyr
+#' @export
+db_begin <- function(con, ...) UseMethod("db_begin")
+
+#' @name backend_dbplyr
+#' @export
+db_commit <- function(con, ...) UseMethod("db_commit")
+
+#' @name backend_dbplyr
+#' @export
+db_rollback <- function(con, ...) UseMethod("db_rollback")
+
+#' @name backend_dbplyr
+#' @export
+db_write_table <- function(con, table, types, values, temporary = FALSE, ...) {
+ UseMethod("db_write_table")
+}
+
+#' @name backend_dbplyr
+#' @export
+db_create_table <- function(con, table, types, temporary = FALSE, ...) {
+ UseMethod("db_create_table")
+}
+
+#' @name backend_dbplyr
+#' @export
+db_insert_into <- function(con, table, values, ...) {
+ UseMethod("db_insert_into")
+}
+
+#' @name backend_dbplyr
+#' @export
+db_create_indexes <- function(con, table, indexes = NULL, unique = FALSE, ...) {
+ UseMethod("db_create_indexes")
+}
+
+#' @name backend_dbplyr
+#' @export
+db_create_index <- function(con, table, columns, name = NULL, unique = FALSE,
+ ...) {
+ UseMethod("db_create_index")
+}
+
+#' @name backend_dbplyr
+#' @export
+db_drop_table <- function(con, table, force = FALSE, ...) {
+ UseMethod("db_drop_table")
+}
+
+#' @name backend_dbplyr
+#' @export
+db_analyze <- function(con, table, ...) UseMethod("db_analyze")
+
+#' @export
+#' @rdname backend_dbplyr
+db_explain <- function(con, sql, ...) {
+ UseMethod("db_explain")
+}
+
+#' @rdname backend_dbplyr
+#' @export
+db_query_fields <- function(con, sql, ...) {
+ UseMethod("db_query_fields")
+}
+
+#' @rdname backend_dbplyr
+#' @export
+db_query_rows <- function(con, sql, ...) {
+ UseMethod("db_query_rows")
+}
+
+#' @rdname backend_dbplyr
+#' @export
+sql_select <- function(con, select, from, where = NULL, group_by = NULL,
+ having = NULL, order_by = NULL, limit = NULL,
+ distinct = FALSE, ...) {
+ UseMethod("sql_select")
+}
+
+#' @export
+#' @rdname backend_dbplyr
+sql_subquery <- function(con, from, name = random_table_name(), ...) {
+ UseMethod("sql_subquery")
+}
+
+#' @rdname backend_dbplyr
+#' @export
+sql_join <- function(con, x, y, vars, type = "inner", by = NULL, ...) {
+ UseMethod("sql_join")
+}
+
+#' @rdname backend_dbplyr
+#' @export
+sql_semi_join <- function(con, x, y, anti = FALSE, by = NULL, ...) {
+ UseMethod("sql_semi_join")
+}
+
+#' @rdname backend_dbplyr
+#' @export
+sql_set_op <- function(con, x, y, method) {
+ UseMethod("sql_set_op")
+}
+
+#' @rdname backend_dbplyr
+#' @export
+sql_escape_string <- function(con, x) UseMethod("sql_escape_string")
+
+#' @rdname backend_dbplyr
+#' @export
+sql_escape_ident <- function(con, x) UseMethod("sql_escape_ident")
diff --git a/R/desc.r b/R/desc.r
index e00571b..d654506 100644
--- a/R/desc.r
+++ b/R/desc.r
@@ -1,12 +1,16 @@
-#' Descending order.
+#' Descending order
#'
#' Transform a vector into a format that will be sorted in descending order.
+#' This is useful within [arrange()].
#'
#' @param x vector to transform
#' @export
#' @examples
#' desc(1:10)
#' desc(factor(letters))
+#'
#' first_day <- seq(as.Date("1910/1/1"), as.Date("1920/1/1"), "years")
#' desc(first_day)
+#'
+#' starwars %>% arrange(desc(mass))
desc <- function(x) -xtfrm(x)
diff --git a/R/distinct.R b/R/distinct.R
index fd08d69..218de5e 100644
--- a/R/distinct.R
+++ b/R/distinct.R
@@ -1,19 +1,19 @@
-#' Select distinct/unique rows.
+#' Select distinct/unique rows
#'
#' Retain only unique/distinct rows from an input tbl. This is similar
-#' to \code{\link{unique.data.frame}}, but considerably faster.
+#' to [unique.data.frame()], but considerably faster.
#'
#' @param .data a tbl
#' @param ... Optional variables to use when determining uniqueness. If there
#' are multiple rows for a given combination of inputs, only the first
#' row will be preserved. If omitted, will use all variables.
-#' @param .keep_all If \code{TRUE}, keep all variables in \code{.data}.
-#' If a combination of \code{...} is not distinct, this keeps the
+#' @param .keep_all If `TRUE`, keep all variables in `.data`.
+#' If a combination of `...` is not distinct, this keeps the
#' first row of values.
#' @inheritParams filter
#' @export
#' @examples
-#' df <- data.frame(
+#' df <- tibble(
#' x = sample(10, 100, rep = TRUE),
#' y = sample(10, 100, rep = TRUE)
#' )
@@ -30,12 +30,25 @@
#'
#' # You can also use distinct on computed variables
#' distinct(df, diff = abs(x - y))
+#'
+#' # The same behaviour applies for grouped data frames
+#' # except that the grouping variables are always included
+#' df <- tibble(
+#' g = c(1, 1, 2, 2),
+#' x = c(1, 1, 2, 1)
+#' ) %>% group_by(g)
+#' df %>% distinct()
+#' df %>% distinct(x)
distinct <- function(.data, ..., .keep_all = FALSE) {
- distinct_(.data, .dots = lazyeval::lazy_dots(...), .keep_all = .keep_all)
+ UseMethod("distinct")
+}
+#' @export
+distinct.default <- function(.data, ..., .keep_all = FALSE) {
+ distinct_(.data, .dots = compat_as_lazy_dots(...), .keep_all = .keep_all)
}
-
#' @export
-#' @rdname distinct
+#' @rdname se-deprecated
+#' @inheritParams distinct
distinct_ <- function(.data, ..., .dots, .keep_all = FALSE) {
UseMethod("distinct_")
}
@@ -43,44 +56,48 @@ distinct_ <- function(.data, ..., .dots, .keep_all = FALSE) {
#' Same basic philosophy as group_by: lazy_dots comes in, list of data and
#' vars (character vector) comes out.
#' @noRd
-distinct_vars <- function(.data, ..., .dots, .keep_all = FALSE) {
- dots <- lazyeval::all_dots(.dots, ..., all_named = TRUE)
+distinct_vars <- function(.data, vars, group_vars = character(), .keep_all = FALSE) {
+ stopifnot(is_quosures(vars), is.character(group_vars))
# If no input, keep all variables
- if (length(dots) == 0) {
- return(list(data = .data, vars = names(.data), keep = names(.data)))
+ if (length(vars) == 0) {
+ return(list(
+ data = .data,
+ vars = names(.data),
+ keep = names(.data)
+ ))
}
# If any calls, use mutate to add new columns, then distinct on those
- needs_mutate <- vapply(dots, function(x) !is.name(x$expr), logical(1))
+ needs_mutate <- map_lgl(vars, quo_is_lang)
if (any(needs_mutate)) {
- .data <- mutate_(.data, .dots = dots[needs_mutate])
+ .data <- mutate(.data, !!! vars[needs_mutate])
}
# Once we've done the mutate, we no longer need lazy objects, and
# can instead just use their names
- vars <- names(dots)
+ out_vars <- intersect(names(.data), c(names(vars), group_vars))
if (.keep_all) {
keep <- names(.data)
} else {
- keep <- vars
+ keep <- unique(out_vars)
}
- list(data = .data, vars = vars, keep = keep)
+ list(data = .data, vars = out_vars, keep = keep)
}
#' Efficiently count the number of unique values in a set of vector
#'
-#' This is a faster and more concise equivalent of \code{length(unique(x))}
+#' This is a faster and more concise equivalent of `length(unique(x))`
#'
#' @param \dots vectors of values
-#' @param na.rm id \code{TRUE} missing values don't count
+#' @param na.rm if `TRUE` missing values don't count
#' @examples
#' x <- sample(1:10, 1e5, rep = TRUE)
#' length(unique(x))
#' n_distinct(x)
#' @export
-n_distinct <- function(..., na.rm = FALSE){
+n_distinct <- function(..., na.rm = FALSE) {
n_distinct_multi(list(...), na.rm)
}
diff --git a/R/do.r b/R/do.r
index 2dbe4e9..080abb6 100644
--- a/R/do.r
+++ b/R/do.r
@@ -1,13 +1,13 @@
-#' Do arbitrary operations on a tbl.
+#' Do anything
#'
#' This is a general purpose complement to the specialised manipulation
-#' functions \code{\link{filter}}, \code{\link{select}}, \code{\link{mutate}},
-#' \code{\link{summarise}} and \code{\link{arrange}}. You can use \code{do}
+#' functions [filter()], [select()], [mutate()],
+#' [summarise()] and [arrange()]. You can use `do()`
#' to perform arbitrary computation, returning either a data frame or
#' arbitrary objects which will be stored in a list. This is particularly
#' useful when working with models: you can fit models per group with
-#' \code{do} and then flexibly extract components with either another
-#' \code{do} or \code{summarise}.
+#' `do()` and then flexibly extract components with either another
+#' `do()` or `summarise()`.
#'
#' For an empty data frame, the expressions will be evaluated once, even in the
#' presence of a grouping. This makes sure that the format of the resulting
@@ -15,30 +15,30 @@
#'
#' @section Connection to plyr:
#'
-#' If you're familiar with plyr, \code{do} with named arguments is basically
-#' equivalent to \code{dlply}, and \code{do} with a single unnamed argument
-#' is basically equivalent to \code{ldply}. However, instead of storing
+#' If you're familiar with plyr, `do()` with named arguments is basically
+#' equivalent to [plyr::dlply()], and `do()` with a single unnamed argument
+#' is basically equivalent to [plyr::ldply()]. However, instead of storing
#' labels in a separate attribute, the result is always a data frame. This
-#' means that \code{summarise} applied to the result of \code{do} can
-#' act like \code{ldply}.
+#' means that `summarise()` applied to the result of `do()` can
+#' act like `ldply()`.
#'
#' @inheritParams filter
#' @param .data a tbl
#' @param ... Expressions to apply to each group. If named, results will be
#' stored in a new column. If unnamed, should return a data frame. You can
-#' use \code{.} to refer to the current group. You can not mix named and
+#' use `.` to refer to the current group. You can not mix named and
#' unnamed arguments.
#' @return
-#' \code{do} always returns a data frame. The first columns in the data frame
-#' will be the labels, the others will be computed from \code{...}. Named
+#' `do()` always returns a data frame. The first columns in the data frame
+#' will be the labels, the others will be computed from `...`. Named
#' arguments become list-columns, with one element for each group; unnamed
#' elements must be data frames and labels will be duplicated accordingly.
#'
#' Groups are preserved for a single unnamed input. This is different to
-#' \code{\link{summarise}} because \code{do} generally does not reduce the
+#' [summarise()] because `do()` generally does not reduce the
#' complexity of the data, it just expresses it in a special way. For
#' multiple named inputs, the output is grouped by row with
-#' \code{\link{rowwise}}. This allows other verbs to work in an intuitive
+#' [rowwise()]. This allows other verbs to work in an intuitive
#' way.
#' @export
#' @examples
@@ -81,17 +81,24 @@
#' }
#' }
do <- function(.data, ...) {
- do_(.data, .dots = lazyeval::lazy_dots(...))
+ UseMethod("do")
+}
+#' @export
+do.default <- function(.data, ...) {
+ do_(.data, .dots = compat_as_lazy_dots(...))
}
-
#' @export
-#' @rdname do
-do_ <- function(.data, ..., .dots) {
+#' @rdname se-deprecated
+do_ <- function(.data, ..., .dots = list()) {
UseMethod("do_")
}
#' @export
-do_.NULL <- function(.data, ..., .dots) {
+do.NULL <- function(.data, ...) {
+ NULL
+}
+#' @export
+do_.NULL <- function(.data, ..., .dots = list()) {
NULL
}
@@ -100,8 +107,10 @@ do_.NULL <- function(.data, ..., .dots) {
label_output_dataframe <- function(labels, out, groups) {
data_frame <- vapply(out[[1]], is.data.frame, logical(1))
if (any(!data_frame)) {
- stop("Results are not data frames at positions: ",
- paste(which(!data_frame), collapse = ", "), call. = FALSE)
+ bad("Results {bad} must be data frames, not {first_bad_class}",
+ bad = fmt_comma(which(!data_frame)),
+ first_bad_class = fmt_classes(out[[1]][[which.min(data_frame)]])
+ )
}
rows <- vapply(out[[1]], nrow, numeric(1))
@@ -136,17 +145,15 @@ named_args <- function(args) {
# Arguments must either be all named or all unnamed.
named <- sum(names2(args) != "")
if (!(named == 0 || named == length(args))) {
- stop("Arguments to do() must either be all named or all unnamed",
- call. = FALSE)
+ abort("Arguments must either be all named or all unnamed")
}
if (named == 0 && length(args) > 1) {
- stop("Can only supply single unnamed argument to do()", call. = FALSE)
+ bad("Can only supply one unnamed argument, not {length(args)}")
}
# Check for old syntax
if (named == 1 && names(args) == ".f") {
- stop("do syntax changed in dplyr 0.2. Please see documentation for details",
- call. = FALSE)
+ abort("do syntax changed in dplyr 0.2. Please see documentation for details")
}
named != 0
diff --git a/R/dplyr.r b/R/dplyr.r
index 8510c34..3d54416 100644
--- a/R/dplyr.r
+++ b/R/dplyr.r
@@ -2,7 +2,7 @@
#'
#' dplyr provides a flexible grammar of data manipulation. It's the next
#' iteration of plyr, focused on tools for working with data frames (hence the
-#' \emph{d} in the name).
+#' *d* in the name).
#'
#' It has three main goals:
#'
@@ -16,13 +16,32 @@
#' }
#'
#' To learn more about dplyr, start with the vignettes:
-#' \code{browseVignettes(package = "dplyr")}
+#' `browseVignettes(package = "dplyr")`
#'
-#' @docType package
-#' @name dplyr
-#' @useDynLib dplyr
-#' @import assertthat
-#' @importFrom utils head tail
+#' @section Package options:
+#' \describe{
+#' \item{`dplyr.show_progress`}{Should lengthy operations such as `do()`
+#' show a progress bar? Default: `TRUE`}
+#' }
+#'
+#' @section Package configurations:
+#' These can be set on a package-by-package basis, or for the global environment.
+#' See [pkgconfig::set_config()] for usage.
+#' \describe{
+#' \item{`dplyr::na_matches`}{Should `NA` values be matched in data frame joins
+#' by default? Default: `"na"` (for compatibility with dplyr v0.5.0 and earlier,
+#' subject to change), alternative value: `"never"` (the default
+#' for database backends, see [join.tbl_df()]).}
+#' }
+#'
+#' @useDynLib dplyr, .registration = TRUE
+#' @import rlang
+#' @importFrom assertthat assert_that is.flag on_failure<-
+#' @importFrom glue glue
#' @importFrom Rcpp cppFunction Rcpp.plugin.maker
#' @importFrom stats setNames update
-NULL
+#' @importFrom utils head tail
+#' @importFrom methods is
+#' @importFrom bindrcpp create_env
+#' @importFrom pkgconfig get_config
+"_PACKAGE"
diff --git a/R/dr.R b/R/dr.R
new file mode 100644
index 0000000..8bb2795
--- /dev/null
+++ b/R/dr.R
@@ -0,0 +1,35 @@
+Rcpp_version <- utils::packageVersion("Rcpp")
+R_version <- R.version.string
+
+#' Dr Dplyr checks your installation for common problems.
+#'
+#' Only run this if you are seeing problems, like random crashes.
+#' It's possible for `dr_dplyr` to return false positives, so there's no
+#' need to run if all is ok.
+#'
+#' @export
+#' @examples
+#' \dontrun{
+#' dr_dplyr()
+#' }
+dr_dplyr <- function() {
+ if (Rcpp_version != utils::packageVersion("Rcpp")) {
+ warning(
+ "Installed Rcpp (", utils::packageVersion("Rcpp"), ") different from ",
+ "Rcpp used to build dplyr (", Rcpp_version, ").\n",
+ "Please reinstall dplyr to avoid random crashes or undefined behavior.",
+ call. = FALSE
+ )
+ }
+
+ if (R_version != R.version.string) {
+ warning(
+ "Installed R (", R.version.string, ") different from ",
+ "R used to build dplyr (", R_version, ").\n",
+ "Please reinstall dplyr to avoid random crashes or undefined behavior.",
+ call. = FALSE
+ )
+ }
+
+ invisible(NULL)
+}
diff --git a/R/error.R b/R/error.R
new file mode 100644
index 0000000..117981e
--- /dev/null
+++ b/R/error.R
@@ -0,0 +1,133 @@
+check_pkg <- function(name, reason, install = TRUE) {
+ if (is_installed(name))
+ return(invisible(TRUE))
+
+ glubort(NULL,
+ 'The {name} package is required to {reason}.',
+ if (install) '\nPlease install it with `install.packages("{name}")`'
+ )
+}
+
+# ngettext() does extra work, this function is a simpler version
+ntext <- function(n, msg1, msg2) {
+ if (n == 1) msg1 else msg2
+}
+
+bad <- function(..., .envir = parent.frame()) {
+ glubort(NULL, ..., .envir = parent.frame())
+}
+
+bad_args <- function(args, ..., .envir = parent.frame()) {
+ glubort(fmt_args(args), ..., .envir = .envir)
+}
+
+bad_pos_args <- function(pos_args, ..., .envir = parent.frame()) {
+ glubort(fmt_pos_args(pos_args), ..., .envir = .envir)
+}
+
+bad_calls <- function(calls, ..., .envir = parent.frame()) {
+ glubort(fmt_calls(calls), ..., .envir = .envir)
+}
+
+bad_named_calls <- function(named_calls, ..., .envir = parent.frame()) {
+ glubort(fmt_named_calls(named_calls), ..., .envir = .envir)
+}
+
+bad_eq_ops <- function(named_calls, ..., .envir = parent.frame()) {
+ glubort(fmt_wrong_eq_ops(named_calls), ..., .envir = .envir)
+}
+
+bad_cols <- function(cols, ..., .envir = parent.frame()) {
+ glubort(fmt_cols(cols), ..., .envir = .envir)
+}
+
+bad_measures <- function(measures, ..., .envir = parent.frame()) {
+ glubort(fmt_measures(measures), ..., .envir = .envir)
+}
+
+glubort <- function(header, ..., .envir = parent.frame(), .abort = abort) {
+ text <- glue(..., .envir = .envir)
+ if (!is_null(header)) text <- paste0(header, " ", text)
+ .abort(text)
+}
+
+fmt_args <- function(x) {
+ x <- parse_args(x)
+ fmt_obj(x)
+}
+
+fmt_pos_args <- function(x) {
+ args <- ntext(length(x), "Argument", "Arguments")
+ glue("{args} {fmt_comma(x)}")
+}
+
+fmt_calls <- function(...) {
+ x <- parse_named_call(...)
+ fmt_obj(x)
+}
+
+fmt_named_calls <- function(...) {
+ x <- parse_named_call(...)
+ fmt_named(x)
+}
+
+fmt_wrong_eq_ops <- function(...) {
+ x <- parse_named_call(...)
+ fmt_comma(
+ paste0(fmt_obj1(names2(x)), " (", fmt_obj1(paste0(names2(x), " = ", x)), ")")
+ )
+}
+
+fmt_cols <- function(x) {
+ cols <- ntext(length(x), "Column", "Columns")
+ glue("{cols} {fmt_obj(x)}")
+}
+
+fmt_measures <- function(x) {
+ measures <- ntext(length(x), "Measure", "Measures")
+ glue("{measures} {fmt_obj(x)}")
+}
+
+fmt_named <- function(x) {
+ fmt_comma(paste0(fmt_obj1(names2(x)), " = ", x))
+}
+
+fmt_obj <- function(x) {
+ fmt_comma(fmt_obj1(x))
+}
+
+fmt_obj1 <- function(x) {
+ paste0("`", x, "`")
+}
+
+fmt_classes <- function(x) {
+ paste(class(x), collapse = "/")
+}
+
+fmt_dims <- function(x) {
+ paste0("[", paste0(x, collapse = " x "), "]")
+}
+
+fmt_comma <- function(...) {
+ MAX_ITEMS <- 6L
+
+ x <- paste0(...)
+ if (length(x) > MAX_ITEMS) {
+ length(x) <- MAX_ITEMS
+ x[[MAX_ITEMS]] <- "..."
+ }
+
+ commas(x)
+}
+
+parse_args <- function(x) {
+ # convert single formula to list of length 1
+ x <- unlist(list(x), recursive = FALSE)
+ is_fml <- map_lgl(x, is_formula)
+ x[is_fml] <- map_chr(map(x[is_fml], "[[", 2), as_string)
+ unlist(x)
+}
+
+parse_named_call <- function(x) {
+ map_chr(x, quo_text)
+}
diff --git a/R/explain.r b/R/explain.r
index b33592d..3dc3028 100644
--- a/R/explain.r
+++ b/R/explain.r
@@ -1,21 +1,22 @@
-#' Explain details of a tbl.
+#' Explain details of a tbl
#'
#' This is a generic function which gives more details about an object than
-#' \code{\link{print}}, and is more focussed on human readable output than
-#' \code{\link{str}}.
+#' [print()], and is more focused on human readable output than
+#' [str()].
#'
#' @section Databases:
-#' Explaining a \code{tbl_sql} will run the SQL \code{EXPLAIN} command which
+#' Explaining a `tbl_sql` will run the SQL `EXPLAIN` command which
#' will describe the query plan. This requires a little bit of knowledge about
-#' how \code{EXPLAIN} works for your database, but is very useful for
+#' how `EXPLAIN` works for your database, but is very useful for
#' diagnosing performance problems.
#'
#' @export
#' @param x An object to explain
#' @param ... Other parameters possibly used by generic
+#' @return The first argument, invisibly.
#' @examples
#' \donttest{
-#' if (require("RSQLite") && has_lahman("sqlite")) {
+#' if (require("dbplyr")) {
#'
#' lahman_s <- lahman_sqlite()
#' batting <- tbl(lahman_s, "Batting")
@@ -39,17 +40,7 @@ explain <- function(x, ...) {
}
#' @export
-explain.tbl_sql <- function(x, ...) {
- force(x)
- show_query(x)
- message("\n")
- message("<PLAN>\n", db_explain(x$src$con, sql_render(x)))
-
- invisible(NULL)
-}
-
-#' @export
#' @rdname explain
-show_query <- function(x) {
- message("<SQL>\n", sql_render(x))
+show_query <- function(x, ...) {
+ UseMethod("show_query")
}
diff --git a/R/failwith.r b/R/failwith.r
index ac79646..7dc30be 100644
--- a/R/failwith.r
+++ b/R/failwith.r
@@ -1,26 +1,17 @@
#' Fail with specified value.
#'
-#' Modify a function so that it returns a default value when there is an
-#' error.
+#' Deprecated. Please use [purrr::possibly()] instead.
#'
#' @param default default value
#' @param f function
#' @param quiet all error messages be suppressed?
#' @return a function
-#' @seealso \code{\link[plyr]{try_default}}
-#' @keywords debugging
+#' @seealso [plyr::try_default()]
+#' @keywords internal
#' @export
-#' @examples
-#' f <- function(x) if (x == 1) stop("Error!") else 1
-#' \dontrun{
-#' f(1)
-#' f(2)
-#' }
-#'
-#' safef <- failwith(NULL, f)
-#' safef(1)
-#' safef(2)
failwith <- function(default = NULL, f, quiet = FALSE) {
+ warn("Deprecated: please use `purrr::possibly()` instead")
+
function(...) {
out <- default
try(out <- f(...), silent = quiet)
diff --git a/R/funs-predicates.R b/R/funs-predicates.R
new file mode 100644
index 0000000..e421fe7
--- /dev/null
+++ b/R/funs-predicates.R
@@ -0,0 +1,48 @@
+## Return the union or intersection of predicate expressions.
+##
+## `all_exprs()` and `any_exprs()` take predicate expressions and join them
+## into a single predicate. They assume vectorised expressions by
+## default and join them with `&` or `|`. Note that this will also
+## work with scalar predicates, but if you want to be explicit you can
+## set `.vectorised` to `FALSE` to join by `&&` or `||`.
+##
+## @param ... Predicate expressions.
+## @param .vectorised If `TRUE`, predicates are joined with `&` or
+## `|`. Otherwise, they are joined with `&&` or `||`.
+## @return A [quosure][rlang::quo].
+## @export
+## @examples
+## all_exprs(cyl > 3, am == 1)
+## any_exprs(cyl > 3, am == 1)
+## any_exprs(cyl > 3, am == 1, .vectorised = FALSE)
+all_exprs <- function(..., .vectorised = TRUE) {
+ op <- if (.vectorised) quote(`&`) else quote(`&&`)
+ quo_reduce(..., .op = op)
+}
+## @rdname all_exprs
+## @export
+any_exprs <- function(..., .vectorised = TRUE) {
+ op <- if (.vectorised) quote(`|`) else quote(`||`)
+ quo_reduce(..., .op = op)
+}
+
+## @param .op Can be a function or a quoted name of a function. If a
+## quoted name, the default environment is the [base
+## environment][rlang::base_env] unless you supply a
+## [quosure][rlang::quo].
+quo_reduce <- function(..., .op) {
+ stopifnot(is_symbol(.op) || is_function(.op))
+
+ dots <- quos(...)
+ if (length(dots) == 0) {
+ abort("At least one expression must be given")
+ } else if (length(dots) == 1) {
+ return(dots[[1]])
+ }
+
+ op_quo <- as_quosure(.op, base_env())
+ op <- f_rhs(op_quo)
+
+ expr <- reduce(dots, function(x, y) expr(UQ(op)((!! x), (!! y))))
+ new_quosure(expr, f_env(op_quo))
+}
diff --git a/R/funs.R b/R/funs.R
index 46d6023..985f039 100644
--- a/R/funs.R
+++ b/R/funs.R
@@ -1,143 +1,133 @@
#' Create a list of functions calls.
#'
-#' \code{funs} provides a flexible way to generate a named list of functions for
-#' input to other functions like \code{summarise_each}.
+#' `funs()` provides a flexible way to generate a named list of
+#' functions for input to other functions like [summarise_at()].
#'
-#' @param dots,... A list of functions specified by:
+#' @param ... A list of functions specified by:
#'
-#' \itemize{
-#' \item Their name, \code{"mean"}
-#' \item The function itself, \code{mean}
-#' \item A call to the function with \code{.} as a dummy parameter,
-#' \code{mean(., na.rm = TRUE)}
-#' }
-#' @param args A named list of additional arguments to be added to all
-#' function calls.
-#' @param env The environment in which functions should be evaluated.
+#' - Their name, `"mean"`
+#' - The function itself, `mean`
+#' - A call to the function with `.` as a dummy argument,
+#' `mean(., na.rm = TRUE)`
+#'
+#' These arguments are automatically [quoted][rlang::quo]. They
+#' support [unquoting][rlang::quasiquotation] and splicing. See
+#' `vignette("programming")` for an introduction to these concepts.
+#' @param .args,args A named list of additional arguments to be added
+#' to all function calls.
#' @export
#' @examples
#' funs(mean, "mean", mean(., na.rm = TRUE))
#'
-#' # Overide default names
+#' # Override default names
#' funs(m1 = mean, m2 = "mean", m3 = mean(., na.rm = TRUE))
#'
#' # If you have function names in a vector, use funs_
#' fs <- c("min", "max")
#' funs_(fs)
-funs <- function(...) funs_(lazyeval::lazy_dots(...))
+funs <- function(..., .args = list()) {
+ dots <- quos(...)
+ default_env <- caller_env()
-#' @export
-#' @rdname funs
-funs_ <- function(dots, args = list(), env = baseenv()) {
- dots <- lazyeval::as.lazy_dots(dots, env)
- env <- lazyeval::common_env(dots)
-
- names(dots) <- names2(dots)
+ funs <- map(dots, function(quo) as_fun(quo, default_env, .args))
+ new_funs(funs)
+}
+new_funs <- function(funs) {
+ names(funs) <- names2(funs)
- dots[] <- lapply(dots, function(x) {
- x$expr <- make_call(x$expr, args)
- x
+ missing_names <- names(funs) == ""
+ default_names <- map_chr(funs[missing_names], function(dot) {
+ quo_name(node_car(f_rhs(dot)))
})
+ names(funs)[missing_names] <- default_names
- missing_names <- names(dots) == ""
- default_names <- vapply(dots[missing_names], function(x) make_name(x$expr),
- character(1))
- names(dots)[missing_names] <- default_names
-
- class(dots) <- c("fun_list", "lazy_dots")
- attr(dots, "has_names") <- any(!missing_names)
- dots
+ class(funs) <- "fun_list"
+ attr(funs, "have_name") <- any(!missing_names)
+ funs
}
-is.fun_list <- function(x, env) inherits(x, "fun_list")
+as_fun_list <- function(.x, .quo, .env, ...) {
+ # Capture quosure before evaluating .x
+ force(.quo)
+
+ # If a fun_list, update args
+ args <- dots_list(...)
+ if (is_fun_list(.x)) {
+ if (!is_empty(args)) {
+ .x[] <- map(.x, lang_modify, !!! args)
+ }
+ return(.x)
+ }
-as.fun_list <- function(.x, ..., .env = baseenv()) {
- UseMethod("as.fun_list")
-}
-#' @export
-as.fun_list.fun_list <- function(.x, ..., .env = baseenv()) {
- .x[] <- lapply(.x, function(fun) {
- fun$expr <- merge_args(fun$expr, list(...))
- fun
- })
+ # Take functions by expression if they are supplied by name. This
+ # way we can evaluate it hybridly.
+ if (is_function(.x) && quo_is_symbol(.quo)) {
+ .x <- list(.quo)
+ } else if (is_character(.x)) {
+ .x <- as.list(.x)
+ } else if (is_bare_formula(.x, lhs = FALSE)) {
+ .x <- list(as_function(.x))
+ } else if (!is_list(.x)) {
+ .x <- list(.x)
+ }
- .x
-}
-#' @export
-as.fun_list.character <- function(.x, ..., .env = baseenv()) {
- parsed <- lapply(.x, function(.x) parse(text = .x)[[1]])
- funs_(parsed, list(...), .env)
+ funs <- map(.x, as_fun, .env = fun_env(.quo, .env), args)
+ new_funs(funs)
}
-#' @export
-as.fun_list.function <- function(.x, ..., .env = baseenv()) {
- .env <- new.env(parent = .env)
- .env$`__dplyr_colwise_fun` <- .x
- call <- make_call("__dplyr_colwise_fun", list(...))
- dots <- lazyeval::as.lazy_dots(call, .env)
+as_fun <- function(.x, .env, .args) {
+ quo <- as_quosure(.x, .env)
+
+ # For legacy reasons, we support strings. Those are enclosed in the
+ # empty environment and need to be switched to the caller environment.
+ f_env(quo) <- fun_env(quo, .env)
- funs_(dots)
+ expr <- get_expr(.x)
+ if (is_lang(expr) && !is_lang(expr, c("::", ":::"))) {
+ expr <- lang_modify(expr, !!! .args)
+ } else {
+ expr <- lang(expr, quote(.), !!! .args)
+ }
+
+ set_expr(quo, expr)
}
+fun_env <- function(quo, default_env) {
+ env <- f_env(quo)
+ if (is_null(env) || identical(env, empty_env())) {
+ default_env
+ } else {
+ env
+ }
+}
+
+is_fun_list <- function(x, env) {
+ inherits(x, "fun_list")
+}
#' @export
`[.fun_list` <- function(x, i) {
- structure(
- NextMethod(),
- class = c("fun_list", "lazy_dots"),
+ structure(NextMethod(),
+ class = "fun_list",
has_names = attr(x, "has_names")
)
}
-
#' @export
print.fun_list <- function(x, ..., width = getOption("width")) {
cat("<fun_calls>\n")
names <- format(names(x))
- code <- vapply(x, function(x) {
- deparse_trunc(x$expr, width - 2 - nchar(names[1]))
- }, character(1))
+ code <- map_chr(x, function(x) deparse_trunc(f_rhs(x), width - 2 - nchar(names[1])))
cat(paste0("$ ", names, ": ", code, collapse = "\n"))
cat("\n")
invisible(x)
}
-make_call <- function(x, args) {
- if (is.character(x)) {
- call <- substitute(f(.), list(f = as.name(x)))
- } else if (is.name(x)) {
- call <- substitute(f(.), list(f = x))
- } else if (is.call(x)) {
- call <- x
- } else {
- stop("Unknown inputs")
- }
-
- merge_args(call, args)
-}
-make_name <- function(x) {
- if (is.character(x)) {
- x
- } else if (is.name(x)) {
- as.character(x)
- } else if (is.call(x)) {
- as.character(x[[1]])
- } else {
- stop("Unknown input:", class(x)[1])
- }
-}
-
-merge_args <- function(call, args) {
- if (!length(args)) {
- return(call)
- }
- if (is.null(names(args))) {
- stop("Additional arguments should be named", call. = FALSE)
- }
-
- for (param in names(args)) {
- call[[param]] <- args[[param]]
- }
-
- call
+#' @export
+#' @rdname se-deprecated
+#' @inheritParams funs
+#' @param env The environment in which functions should be evaluated.
+funs_ <- function(dots, args = list(), env = base_env()) {
+ dots <- compat_lazy_dots(dots, caller_env())
+ funs(!!! dots, .args = args)
}
diff --git a/R/group-by.r b/R/group-by.r
index 079aba3..267a614 100644
--- a/R/group-by.r
+++ b/R/group-by.r
@@ -1,73 +1,98 @@
-#' Group a tbl by one or more variables.
+#' Group by one or more variables
#'
-#' Most data operations are useful done on groups defined by variables in the
-#' the dataset. The \code{group_by} function takes an existing tbl
-#' and converts it into a grouped tbl where operations are performed
-#' "by group".
+#' @description
+#' Most data operations are done on groups defined by variables.
+#' `group_by()` takes an existing tbl and converts it into a grouped tbl
+#' where operations are performed "by group". `ungroup()` removes grouping.
#'
#' @section Tbl types:
#'
-#' \code{group_by} is an S3 generic with methods for the three built-in
+#' `group_by()` is an S3 generic with methods for the three built-in
#' tbls. See the help for the corresponding classes and their manip
#' methods for more details:
#'
#' \itemize{
-#' \item data.frame: \link{grouped_df}
-#' \item data.table: \link[dtplyr]{grouped_dt}
-#' \item SQLite: \code{\link{src_sqlite}}
-#' \item PostgreSQL: \code{\link{src_postgres}}
-#' \item MySQL: \code{\link{src_mysql}}
+#' \item data.frame: [grouped_df]
+#' \item data.table: [dtplyr::grouped_dt]
+#' \item SQLite: [src_sqlite()]
+#' \item PostgreSQL: [src_postgres()]
+#' \item MySQL: [src_mysql()]
#' }
#'
-#' @seealso \code{\link{ungroup}} for the inverse operation,
-#' \code{\link{groups}} for accessors that don't do special evaluation.
+#' @section Scoped grouping:
+#'
+#' The three [scoped] variants ([group_by_all()], [group_by_if()] and
+#' [group_by_at()]) make it easy to group a dataset by a selection of
+#' variables.
+#'
#' @param .data a tbl
-#' @param ... variables to group by. All tbls accept variable names,
-#' some will also accept functions of variables. Duplicated groups
+#' @param ... Variables to group by. All tbls accept variable names.
+#' Some tbls will accept functions of variables. Duplicated groups
#' will be silently dropped.
-#' @param add By default, when \code{add = FALSE}, \code{group_by} will
-#' override existing groups. To instead add to the existing groups,
-#' use \code{add = TRUE}
+#' @param add When `add = FALSE`, the default, `group_by()` will
+#' override existing groups. To add to the existing groups, use
+#' `add = TRUE`.
#' @inheritParams filter
#' @export
#' @examples
-#' by_cyl <- group_by(mtcars, cyl)
-#' summarise(by_cyl, mean(disp), mean(hp))
-#' filter(by_cyl, disp == max(disp))
+#' by_cyl <- mtcars %>% group_by(cyl)
+#'
+#' # grouping doesn't change how the data looks (apart from listing
+#' # how it's grouped):
+#' by_cyl
#'
-#' # summarise peels off a single layer of grouping
-#' by_vs_am <- group_by(mtcars, vs, am)
-#' by_vs <- summarise(by_vs_am, n = n())
+#' # It changes how it acts with the other dplyr verbs:
+#' by_cyl %>% summarise(
+#' disp = mean(disp),
+#' hp = mean(hp)
+#' )
+#' by_cyl %>% filter(disp == max(disp))
+#'
+#' # Each call to summarise() removes a layer of grouping
+#' by_vs_am <- mtcars %>% group_by(vs, am)
+#' by_vs <- by_vs_am %>% summarise(n = n())
#' by_vs
-#' summarise(by_vs, n = sum(n))
-#' # use ungroup() to remove if not wanted
-#' summarise(ungroup(by_vs), n = sum(n))
+#' by_vs %>% summarise(n = sum(n))
+#'
+#' # To remove grouping, use ungroup()
+#' by_vs %>%
+#' ungroup() %>%
+#' summarise(n = sum(n))
#'
#' # You can group by expressions: this is just short-hand for
#' # a mutate/rename followed by a simple group_by
-#' group_by(mtcars, vsam = vs + am)
-#' group_by(mtcars, vs2 = vs)
-#'
-#' # You can also group by a constant, but it's not very useful
-#' group_by(mtcars, "vs")
+#' mtcars %>% group_by(vsam = vs + am)
#'
-#' # By default, group_by sets groups. Use add = TRUE to add groups
-#' groups(group_by(by_cyl, vs, am))
-#' groups(group_by(by_cyl, vs, am, add = TRUE))
+#' # By default, group_by overrides existing grouping
+#' by_cyl %>%
+#' group_by(vs, am) %>%
+#' group_vars()
#'
-#' # Duplicate groups are silently dropped
-#' groups(group_by(by_cyl, cyl, cyl))
-#' @aliases regroup
+#' # Use add = TRUE to instead append
+#' by_cyl %>%
+#' group_by(vs, am, add = TRUE) %>%
+#' group_vars()
group_by <- function(.data, ..., add = FALSE) {
- group_by_(.data, .dots = lazyeval::lazy_dots(...), add = add)
+ UseMethod("group_by")
}
-
#' @export
-#' @rdname group_by
-group_by_ <- function(.data, ..., .dots, add = FALSE) {
+group_by.default <- function(.data, ..., add = FALSE) {
+ group_by_(.data, .dots = compat_as_lazy_dots(...), add = add)
+}
+#' @export
+#' @rdname se-deprecated
+#' @inheritParams group_by
+group_by_ <- function(.data, ..., .dots = list(), add = FALSE) {
UseMethod("group_by_")
}
+#' @rdname group_by
+#' @export
+#' @param x A [tbl()]
+ungroup <- function(x, ...) {
+ UseMethod("ungroup")
+}
+
#' Prepare for grouping.
#'
#' Performs standard operations that should happen before individual methods
@@ -79,56 +104,57 @@ group_by_ <- function(.data, ..., .dots, add = FALSE) {
#' \item{groups}{Modified groups}
#' @export
#' @keywords internal
-group_by_prepare <- function(.data, ..., .dots, add = FALSE) {
- new_groups <- lazyeval::all_dots(.dots, ...)
- new_groups <- resolve_vars(new_groups, tbl_vars(.data))
+group_by_prepare <- function(.data, ..., .dots = list(), add = FALSE) {
+ new_groups <- c(quos(...), compat_lazy_dots(.dots, caller_env()))
# If any calls, use mutate to add new columns, then group by those
- is_name <- vapply(new_groups, function(x) is.name(x$expr), logical(1))
- has_name <- names2(new_groups) != ""
+ is_symbol <- map_lgl(new_groups, quo_is_symbol)
+ named <- have_name(new_groups)
- needs_mutate <- has_name | !is_name
+ needs_mutate <- named | !is_symbol
if (any(needs_mutate)) {
- .data <- mutate_(.data, .dots = new_groups[needs_mutate])
+ .data <- mutate(.data, !!! new_groups[needs_mutate])
}
# Once we've done the mutate, we no longer need lazy objects, and
# can instead just use symbols
- new_groups <- lazyeval::auto_name(new_groups)
- groups <- lapply(names(new_groups), as.name)
+ new_groups <- exprs_auto_name(new_groups, printer = tidy_text)
+ group_names <- names(new_groups)
if (add) {
- groups <- c(groups(.data), groups)
+ group_names <- c(group_vars(.data), group_names)
}
- groups <- groups[!duplicated(groups)]
+ group_names <- unique(group_names)
- list(data = .data, groups = groups)
+ list(
+ data = .data,
+ groups = syms(group_names),
+ group_names = group_names
+ )
}
-#' Get/set the grouping variables for tbl.
+#' Return grouping variables
#'
-#' These functions do not perform non-standard evaluation, and so are useful
-#' when programming against \code{tbl} objects. \code{ungroup} is a convenient
-#' inline way of removing existing grouping.
+#' `group_vars()` returns a character vector; `groups()` returns a list of
+#' symbols.
#'
-#' @param x data \code{\link{tbl}}
-#' @param ... Additional arguments that maybe used by methods.
+#' @param x A [tbl()]
#' @export
#' @examples
-#' grouped <- group_by(mtcars, cyl)
-#' groups(grouped)
-#' groups(ungroup(grouped))
+#' df <- tibble(x = 1, y = 2) %>% group_by(x, y)
+#' group_vars(df)
+#' groups(df)
groups <- function(x) {
UseMethod("groups")
}
+#' @rdname groups
#' @export
-regroup <- function(x, value) {
- .Deprecated("group_by_")
- group_by_(x, .dots = value)
+group_vars <- function(x) {
+ UseMethod("group_vars")
}
#' @export
-#' @rdname groups
-ungroup <- function(x, ...) {
- UseMethod("ungroup")
+group_vars.default <- function(x) {
+ deparse_names(groups(x))
}
+
diff --git a/R/group-indices.R b/R/group-indices.R
index 5554595..0bf9490 100644
--- a/R/group-indices.R
+++ b/R/group-indices.R
@@ -2,7 +2,8 @@
#'
#' Generate a unique id for each group
#'
-#' @seealso \code{\link{group_by}}
+#' @keywords internal
+#' @seealso [group_by()]
#' @param .data a tbl
#' @inheritParams group_by
#' @inheritParams filter
@@ -10,26 +11,41 @@
#' @examples
#' group_indices(mtcars, cyl)
group_indices <- function(.data, ...) {
- group_indices_(.data, .dots = lazyeval::lazy_dots(...) )
+ UseMethod("group_indices")
+}
+#' @export
+group_indices.default <- function(.data, ...) {
+ group_indices_(.data, .dots = compat_as_lazy_dots(...))
}
-
#' @export
-#' @rdname group_indices
-group_indices_ <- function(.data, ..., .dots) {
+#' @rdname se-deprecated
+group_indices_ <- function(.data, ..., .dots = list()) {
UseMethod("group_indices_")
}
#' @export
-group_indices_.data.frame <- function(.data, ..., .dots ){
- groups <- group_by_prepare(.data, .dots = .dots )
- grouped_indices_impl(groups$data, groups$groups)
+group_indices.data.frame <- function(.data, ...) {
+ dots <- quos(...)
+ if (length(dots) == 0L) {
+ return(rep(1L, nrow(.data)))
+ }
+ grouped_indices_grouped_df_impl(group_by(.data, !!! dots))
+}
+#' @export
+group_indices_.data.frame <- function(.data, ..., .dots = list()) {
+ dots <- compat_lazy_dots(.dots, caller_env(), ...)
+ group_indices(.data, !!! dots)
}
#' @export
-group_indices_.grouped_df <- function(.data, ..., .dots ){
- if( length(list(...)) || ( ! missing(.dots) && length(.dots) ) ){
- warning( "group_indices_.grouped_df ignores extra arguments" )
+group_indices.grouped_df <- function(.data, ...) {
+ if (length(list(...))) {
+ warn("group_indices_.grouped_df ignores extra arguments")
}
grouped_indices_grouped_df_impl(.data)
}
-
+#' @export
+group_indices_.grouped_df <- function(.data, ..., .dots = list()) {
+ dots <- compat_lazy_dots(.dots, caller_env(), ...)
+ group_indices(.data, !!! dots)
+}
diff --git a/R/group-size.r b/R/group-size.r
index a4c643c..708088b 100644
--- a/R/group-size.r
+++ b/R/group-size.r
@@ -13,6 +13,7 @@
#' n_groups(by_dest)
#' group_size(by_dest)
#' }
+#' @keywords internal
group_size <- function(x) UseMethod("group_size")
#' @export
diff --git a/R/grouped-df.r b/R/grouped-df.r
index 9b656a7..a1fbd48 100644
--- a/R/grouped-df.r
+++ b/R/grouped-df.r
@@ -1,36 +1,46 @@
#' A grouped data frame.
#'
-#' The easiest way to create a grouped data frame is to call the \code{group_by}
+#' The easiest way to create a grouped data frame is to call the `group_by()`
#' method on a data frame or tbl: this will take care of capturing
-#' the unevalated expressions for you.
+#' the unevaluated expressions for you.
#'
#' @keywords internal
#' @param data a tbl or data frame.
-#' @param vars a list of quoted variables.
-#' @param drop if \code{TRUE} preserve all factor levels, even those without
+#' @param vars a character vector or a list of [name()]
+#' @param drop if `TRUE` preserve all factor levels, even those without
#' data.
#' @export
grouped_df <- function(data, vars, drop = TRUE) {
if (length(vars) == 0) {
return(tbl_df(data))
}
- assert_that(is.data.frame(data), is.list(vars), all(sapply(vars,is.name)), is.flag(drop))
+ assert_that(
+ is.data.frame(data),
+ (is.list(vars) && all(sapply(vars,is.name))) || is.character(vars),
+ is.flag(drop)
+ )
+ if (is.list(vars)) {
+ vars <- deparse_names(vars)
+ }
grouped_df_impl(data, unname(vars), drop)
}
+setOldClass(c("grouped_df", "tbl_df", "tbl", "data.frame"))
+
#' @rdname grouped_df
#' @export
is.grouped_df <- function(x) inherits(x, "grouped_df")
-
+#' @rdname grouped_df
#' @export
-print.grouped_df <- function(x, ..., n = NULL, width = NULL) {
- cat("Source: local data frame ", dim_desc(x), "\n", sep = "")
+is_grouped_df <- is.grouped_df
+#' @export
+tbl_sum.grouped_df <- function(x) {
grps <- if (is.null(attr(x, "indices"))) "?" else length(attr(x, "indices"))
- cat("Groups: ", commas(deparse_all(groups(x))), " [", big_mark(grps), "]\n", sep = "")
- cat("\n")
- print(trunc_mat(x, n = n, width = width), ...)
- invisible(x)
+ c(
+ NextMethod(),
+ c("Groups" = paste0(commas(group_vars(x)), " [", big_mark(grps), "]"))
+ )
}
#' @export
@@ -45,7 +55,15 @@ n_groups.grouped_df <- function(x) {
#' @export
groups.grouped_df <- function(x) {
- attr(x, "vars")
+ syms(group_vars(x))
+}
+
+#' @export
+group_vars.grouped_df <- function(x) {
+ vars <- attr(x, "vars")
+  # Need this for compatibility with existing packages that might store "vars" as a list of symbols
+ if (is.list(vars)) vars <- map_chr(vars, as_string)
+ vars
}
#' @export
@@ -57,6 +75,13 @@ as.data.frame.grouped_df <- function(x, row.names = NULL,
}
#' @export
+as_data_frame.grouped_df <- function(x, ...) {
+ x <- ungroup(x)
+ class(x) <- c("tbl_df", "tbl", "data.frame")
+ x
+}
+
+#' @export
ungroup.grouped_df <- function(x, ...) {
ungroup_grouped_df(x)
}
@@ -65,12 +90,12 @@ ungroup.grouped_df <- function(x, ...) {
`[.grouped_df` <- function(x, i, j, ...) {
y <- NextMethod()
- group_vars <- vapply(groups(x), as.character, character(1))
+ group_names <- group_vars(x)
- if (!all(group_vars %in% names(y))) {
+ if (!all(group_names %in% names(y))) {
tbl_df(y)
} else {
- grouped_df(y, groups(x))
+ grouped_df(y, group_names)
}
}
@@ -89,85 +114,102 @@ cbind.grouped_df <- function(...) {
# One-table verbs --------------------------------------------------------------
-#' @export
-select_.grouped_df <- function(.data, ..., .dots) {
- dots <- lazyeval::all_dots(.dots, ...)
- vars <- select_vars_(names(.data), dots)
- vars <- ensure_grouped_vars(vars, .data)
+# see arrange.r for arrange.grouped_df
+#' @export
+select.grouped_df <- function(.data, ...) {
+ # Pass via splicing to avoid matching select_vars() arguments
+ vars <- select_vars(names(.data), !!! quos(...))
+ vars <- ensure_group_vars(vars, .data)
select_impl(.data, vars)
}
+#' @export
+select_.grouped_df <- function(.data, ..., .dots = list()) {
+ dots <- compat_lazy_dots(.dots, caller_env(), ...)
+ select.grouped_df(.data, !!! dots)
+}
-ensure_grouped_vars <- function(vars, data, notify = TRUE) {
- group_names <- vapply(groups(data), as.character, character(1))
+ensure_group_vars <- function(vars, data, notify = TRUE) {
+ group_names <- group_vars(data)
missing <- setdiff(group_names, vars)
if (length(missing) > 0) {
if (notify) {
- message("Adding missing grouping variables: ",
- paste0("`", missing, "`", collapse = ", "))
+ inform(glue(
+ "Adding missing grouping variables: ",
+ paste0("`", missing, "`", collapse = ", ")
+ ))
}
- vars <- c(stats::setNames(missing, missing), vars)
+ vars <- c(set_names(missing, missing), vars)
}
vars
}
#' @export
-rename_.grouped_df <- function(.data, ..., .dots) {
- dots <- lazyeval::all_dots(.dots, ...)
- vars <- rename_vars_(names(.data), dots)
-
+rename.grouped_df <- function(.data, ...) {
+ vars <- rename_vars(names(.data), !!! quos(...))
select_impl(.data, vars)
}
+#' @export
+rename_.grouped_df <- function(.data, ..., .dots = list()) {
+ dots <- compat_lazy_dots(.dots, caller_env(), ...)
+ rename(.data, !!! dots)
+}
# Do ---------------------------------------------------------------------------
-
#' @export
-do_.grouped_df <- function(.data, ..., env = parent.frame(), .dots) {
+do.grouped_df <- function(.data, ...) {
# Force computation of indices
- if (is.null(attr(.data, "indices"))) {
- .data <- grouped_df_impl(.data, attr(.data, "vars"),
- attr(.data, "drop") %||% TRUE)
+ if (is_null(attr(.data, "indices"))) {
+ .data <- grouped_df_impl(
+ .data, attr(.data, "vars"),
+ attr(.data, "drop") %||% TRUE
+ )
}
+ index <- attr(.data, "indices")
+ labels <- attr(.data, "labels")
# Create ungroup version of data frame suitable for subsetting
group_data <- ungroup(.data)
- args <- lazyeval::all_dots(.dots, ...)
+ args <- quos(...)
named <- named_args(args)
- env <- new.env(parent = lazyeval::common_env(args))
- labels <- attr(.data, "labels")
+ env <- child_env(NULL)
- index <- attr(.data, "indices")
n <- length(index)
m <- length(args)
# Special case for zero-group/zero-row input
if (n == 0) {
- env$. <- group_data
-
- if (!named) {
- out <- eval(args[[1]]$expr, envir = env)[0, , drop = FALSE]
- return(label_output_dataframe(labels, list(list(out)), groups(.data)))
+ if (named) {
+ out <- rep_len(list(list()), length(args))
+ out <- set_names(out, names(args))
+ out <- label_output_list(labels, out, groups(.data))
} else {
- out <- setNames(rep(list(list()), length(args)), names(args))
- return(label_output_list(labels, out, groups(.data)))
+ env_bind(.env = env, . = group_data, .data = group_data)
+ out <- eval_tidy_(args[[1]], env)[0, , drop = FALSE]
+ out <- label_output_dataframe(labels, list(list(out)), groups(.data))
}
+ return(out)
}
- # Create new environment, inheriting from parent, with an active binding
- # for . that resolves to the current subset. `_i` is found in environment
- # of this function because of usual scoping rules.
- makeActiveBinding(env = env, ".", function(value) {
+ # Add pronouns with active bindings that resolve to the current
+ # subset. `_i` is found in environment of this function because of
+ # usual scoping rules.
+ group_slice <- function(value) {
if (missing(value)) {
group_data[index[[`_i`]] + 1L, , drop = FALSE]
} else {
group_data[index[[`_i`]] + 1L, ] <<- value
}
- })
+ }
+ env_bind_fns(.env = env, . = group_slice, .data = group_slice)
+
+ overscope <- new_overscope(env)
+ on.exit(overscope_clean(overscope))
out <- replicate(m, vector("list", n), simplify = FALSE)
names(out) <- names(args)
@@ -175,7 +217,7 @@ do_.grouped_df <- function(.data, ..., env = parent.frame(), .dots) {
for (`_i` in seq_len(n)) {
for (j in seq_len(m)) {
- out[[j]][`_i`] <- list(eval(args[[j]]$expr, envir = env))
+ out[[j]][`_i`] <- list(overscope_eval_next(overscope, args[[j]]))
p$tick()$print()
}
}
@@ -186,66 +228,95 @@ do_.grouped_df <- function(.data, ..., env = parent.frame(), .dots) {
label_output_list(labels, out, groups(.data))
}
}
+#' @export
+do_.grouped_df <- function(.data, ..., env = caller_env(), .dots = list()) {
+ dots <- compat_lazy_dots(.dots, env, ...)
+ do(.data, !!! dots)
+}
# Set operations ---------------------------------------------------------------
#' @export
-distinct_.grouped_df <- function(.data, ..., .dots, .keep_all = FALSE) {
- groups <- lazyeval::as.lazy_dots(groups(.data))
- dist <- distinct_vars(.data, ..., .dots = c(.dots, groups),
- .keep_all = .keep_all)
-
+distinct.grouped_df <- function(.data, ..., .keep_all = FALSE) {
+ dist <- distinct_vars(
+ .data,
+ vars = named_quos(...),
+ group_vars = group_vars(.data),
+ .keep_all = .keep_all
+ )
grouped_df(distinct_impl(dist$data, dist$vars, dist$keep), groups(.data))
}
+#' @export
+distinct_.grouped_df <- function(.data, ..., .dots = list(), .keep_all = FALSE) {
+ dots <- compat_lazy_dots(.dots, caller_env(), ...)
+ distinct(.data, !!! dots, .keep_all = .keep_all)
+}
# Random sampling --------------------------------------------------------------
#' @export
-sample_n.grouped_df <- function(tbl, size, replace = FALSE, weight = NULL,
- .env = parent.frame()) {
+sample_n.grouped_df <- function(tbl, size, replace = FALSE,
+ weight = NULL, .env = NULL) {
- assert_that(is.numeric(size), length(size) == 1, size >= 0)
- weight <- substitute(weight)
+ assert_that(is_scalar_integerish(size), size >= 0)
+ if (!is_null(.env)) {
+ inform("`.env` is deprecated and no longer has any effect")
+ }
+ weight <- enquo(weight)
index <- attr(tbl, "indices")
- sampled <- lapply(index, sample_group, frac = FALSE,
- tbl = tbl, size = size, replace = replace, weight = weight, .env = .env)
+ sampled <- lapply(index, sample_group,
+ frac = FALSE,
+ tbl = tbl,
+ size = size,
+ replace = replace,
+ weight = weight
+ )
idx <- unlist(sampled) + 1
grouped_df(tbl[idx, , drop = FALSE], vars = groups(tbl))
}
#' @export
-sample_frac.grouped_df <- function(tbl, size = 1, replace = FALSE, weight = NULL,
- .env = parent.frame()) {
-
+sample_frac.grouped_df <- function(tbl, size = 1, replace = FALSE,
+ weight = NULL, .env = NULL) {
assert_that(is.numeric(size), length(size) == 1, size >= 0)
+ if (!is_null(.env)) {
+ inform("`.env` is deprecated and no longer has any effect")
+ }
if (size > 1 && !replace) {
- stop("Sampled fraction can't be greater than one unless replace = TRUE",
- call. = FALSE)
+ bad_args("size", "of sampled fraction must be less or equal to one, ",
+ "set `replace` = TRUE to use sampling with replacement"
+ )
}
- weight <- substitute(weight)
+ weight <- enquo(weight)
index <- attr(tbl, "indices")
- sampled <- lapply(index, sample_group, frac = TRUE,
- tbl = tbl, size = size, replace = replace, weight = weight, .env = .env)
+ sampled <- lapply(index, sample_group,
+ frac = TRUE,
+ tbl = tbl,
+ size = size,
+ replace = replace,
+ weight = weight
+ )
idx <- unlist(sampled) + 1
grouped_df(tbl[idx, , drop = FALSE], vars = groups(tbl))
}
-sample_group <- function(tbl, i, frac = FALSE, size, replace = TRUE,
- weight = NULL, .env = parent.frame()) {
+sample_group <- function(tbl, i, frac, size, replace, weight) {
n <- length(i)
- if (frac) size <- round(size * n)
-
- check_size(size, n, replace)
+ if (frac) {
+ check_frac(size, replace)
+ size <- round(size * n)
+ } else {
+ check_size(size, n, replace)
+ }
- # weight use standard evaluation in this function
- if (!is.null(weight)) {
- weight <- eval(weight, tbl[i + 1, , drop = FALSE], .env)
+ weight <- eval_tidy(weight, tbl[i + 1, , drop = FALSE])
+ if (!is_null(weight)) {
weight <- check_weight(weight, n)
}
diff --git a/R/hybrid.R b/R/hybrid.R
new file mode 100644
index 0000000..89853af
--- /dev/null
+++ b/R/hybrid.R
@@ -0,0 +1,61 @@
+verify_hybrid <- function(x) {
+ abort("Not in hybrid evaluation")
+}
+
+verify_not_hybrid <- function(x) {
+ x
+}
+
+with_hybrid <- function(expr, ...) {
+ with_hybrid_(enquo(expr), ...)
+}
+
+with_hybrid_ <- function(expr, ...) {
+ stopifnot(any(class(expr) == "formula"))
+ expr[[2]] <- prepend_call(expr[[2]], "verify_hybrid")
+ data <- data_frame(...)
+
+ # Make verify_hybrid() available to the evaluated expression
+ eval_env <- new.env(parent = environment(expr))
+ eval_env$verify_hybrid <- verify_hybrid
+ environment(expr) <- eval_env
+
+ summarise(data, out = !! expr)["out"][[1]]
+}
+
+without_hybrid <- function(expr, ...) {
+ without_hybrid_(enquo(expr), ...)
+}
+
+without_hybrid_ <- function(expr, ...) {
+ stopifnot(any(class(expr) == "formula"))
+ expr[[2]] <- prepend_call(expr[[2]], "verify_not_hybrid")
+ data <- data_frame(...)
+
+ # Make verify_not_hybrid() available to the evaluated expression
+ eval_env <- new.env(parent = environment(expr))
+ eval_env$verify_not_hybrid <- verify_not_hybrid
+ environment(expr) <- eval_env
+
+ summarise(data, out = !! expr)["out"][[1]]
+}
+
+eval_dots <- function(expr, ...) {
+ eval_dots_(enquo(expr), ...)
+}
+
+eval_dots_ <- function(expr, ...) {
+ data <- data_frame(...)
+ eval(expr[[2]], data, enclos = environment(expr))
+}
+
+# some(func()) -> name(some(func()))
+# list(some(func())) -> list(name(some(func())))
+prepend_call <- function(expr, name) {
+ if (is.call(expr) && expr[[1]] == quote(list)) {
+ stopifnot(length(expr) == 2L)
+ call("list", call(name, expr[[2]]))
+ } else {
+ call(name, expr)
+ }
+}
diff --git a/R/id.r b/R/id.r
index 5af8120..0874f5f 100644
--- a/R/id.r
+++ b/R/id.r
@@ -2,19 +2,19 @@
#'
#' Properties:
#' \itemize{
-#' \item \code{order(id)} is equivalent to \code{do.call(order, df)}
+#' \item `order(id)` is equivalent to `do.call(order, df)`
#' \item rows containing the same data have the same value
-#' \item if \code{drop = FALSE} then room for all possibilites
+#' \item if `drop = FALSE` then room for all possibilities
#' }
#'
#' @param .variables list of variables
-#' @param drop drop unusued factor levels?
+#' @param drop drop unused factor levels?
#' @return a numeric vector with attribute n, giving total number of
#' possibilities
#' @keywords internal
#' @export
id <- function(.variables, drop = FALSE) {
- warning("id() is deprecated", call. = FALSE)
+ warn("`id()` is deprecated")
# Drop all zero length inputs
lengths <- vapply(.variables, length, integer(1))
@@ -35,8 +35,7 @@ id <- function(.variables, drop = FALSE) {
p <- length(ids)
# Calculate dimensions
- ndistinct <- vapply(ids, attr, "n", FUN.VALUE = numeric(1),
- USE.NAMES = FALSE)
+ ndistinct <- vapply(ids, attr, "n", FUN.VALUE = numeric(1), USE.NAMES = FALSE)
n <- prod(ndistinct)
if (n > 2 ^ 31) {
# Too big for integers, have to use strings, which will be much slower :(
diff --git a/R/if_else.R b/R/if_else.R
index 1d106bf..6a144a6 100644
--- a/R/if_else.R
+++ b/R/if_else.R
@@ -1,21 +1,21 @@
-#' Vectorised if.
+#' Vectorised if
#'
-#' Compared to the base \code{\link{ifelse}()}, this function is more strict.
-#' It checks that \code{true} and \code{false} are the same type. This
+#' Compared to the base [ifelse()], this function is more strict.
+#' It checks that `true` and `false` are the same type. This
#' strictness makes the output type more predictable, and makes it somewhat
#' faster.
#'
#' @param condition Logical vector
-#' @param true,false Values to use for \code{TRUE} and \code{FALSE} values of
-#' \code{condition}. They must be either the same length as \code{condition},
-#' or length 1. They must also be the same type: \code{if_else} checks that
+#' @param true,false Values to use for `TRUE` and `FALSE` values of
+#' `condition`. They must be either the same length as `condition`,
+#' or length 1. They must also be the same type: `if_else()` checks that
#' they have the same type and same class. All other attributes are
-#' taken from \code{true}.
-#' @param missing If not \code{NULL}, will be used to replace missing
+#' taken from `true`.
+#' @param missing If not `NULL`, will be used to replace missing
#' values.
-#' @return Where \code{condition} is \code{TRUE}, the matching value from
-#' \code{true}, where it's \code{FALSE}, the matching value from \code{false},
-#' otherwise \code{NA}.
+#' @return Where `condition` is `TRUE`, the matching value from
+#' `true`, where it's `FALSE`, the matching value from `false`,
+#' otherwise `NA`.
#' @export
#' @examples
#' x <- c(-5:5, NA)
@@ -29,13 +29,25 @@
#' # Attributes are taken from the `true` vector,
if_else <- function(condition, true, false, missing = NULL) {
if (!is.logical(condition)) {
- stop("`condition` must be logical", call. = FALSE)
+ bad_args("condition", "must be a logical, not {type_of(condition)}")
}
out <- true[rep(NA_integer_, length(condition))]
- out <- replace_with(out, condition & !is.na(condition), true, "`true`")
- out <- replace_with(out, !condition & !is.na(condition), false, "`false`")
- out <- replace_with(out, is.na(condition), missing, "`missing`")
+ out <- replace_with(
+ out, condition, true,
+ fmt_args(~true),
+ glue("length of {fmt_args(~condition)}")
+ )
+ out <- replace_with(
+ out, !condition, false,
+ fmt_args(~false),
+ glue("length of {fmt_args(~condition)}")
+ )
+ out <- replace_with(
+ out, is.na(condition), missing,
+ fmt_args(~missing),
+ glue("length of {fmt_args(~condition)}")
+ )
out
}
diff --git a/R/inline.r b/R/inline.r
index 59988ed..dd2050e 100644
--- a/R/inline.r
+++ b/R/inline.r
@@ -1,6 +1,5 @@
inlineCxxPlugin <- Rcpp.plugin.maker(
- include.before = "#include <dplyr.h>",
- package = "dplyr",
- LinkingTo = c("Rcpp", "BH", "dplyr")
+ include.before = "#include <dplyr.h>",
+ package = "dplyr",
+ LinkingTo = c("Rcpp", "BH", "dplyr")
)
-
diff --git a/R/join.r b/R/join.r
index ca29238..94e9961 100644
--- a/R/join.r
+++ b/R/join.r
@@ -1,67 +1,85 @@
-#' Join two tbls together.
+#' Join two tbls together
#'
#' These are generic functions that dispatch to individual tbl methods - see the
-#' method documentation for details of individual data sources. \code{x} and
-#' \code{y} should usually be from the same data source, but if \code{copy} is
-#' \code{TRUE}, \code{y} will automatically be copied to the same source as
-#' \code{x} - this may be an expensive operation.
+#' method documentation for details of individual data sources. `x` and
+#' `y` should usually be from the same data source, but if `copy` is
+#' `TRUE`, `y` will automatically be copied to the same source as `x`.
#'
#' @section Join types:
#'
#' Currently dplyr supports four join types:
#'
#' \describe{
-#' \item{\code{inner_join}}{return all rows from \code{x} where there are matching
-#' values in \code{y}, and all columns from \code{x} and \code{y}. If there are multiple matches
-#' between \code{x} and \code{y}, all combination of the matches are returned.}
+#' \item{`inner_join()`}{return all rows from `x` where there are matching
+#' values in `y`, and all columns from `x` and `y`. If there are multiple matches
+#' between `x` and `y`, all combination of the matches are returned.}
#'
-#' \item{\code{left_join}}{return all rows from \code{x}, and all columns from \code{x}
-#' and \code{y}. Rows in \code{x} with no match in \code{y} will have \code{NA} values in the new
-#' columns. If there are multiple matches between \code{x} and \code{y}, all combinations
+#' \item{`left_join()`}{return all rows from `x`, and all columns from `x`
+#' and `y`. Rows in `x` with no match in `y` will have `NA` values in the new
+#' columns. If there are multiple matches between `x` and `y`, all combinations
#' of the matches are returned.}
#'
-#' \item{\code{right_join}}{return all rows from \code{y}, and all columns from \code{x}
-#' and y. Rows in \code{y} with no match in \code{x} will have \code{NA} values in the new
-#' columns. If there are multiple matches between \code{x} and \code{y}, all combinations
+#' \item{`right_join()`}{return all rows from `y`, and all columns from `x`
+#' and y. Rows in `y` with no match in `x` will have `NA` values in the new
+#' columns. If there are multiple matches between `x` and `y`, all combinations
#' of the matches are returned.}
#'
-#' \item{\code{semi_join}}{return all rows from \code{x} where there are matching
-#' values in \code{y}, keeping just columns from \code{x}.
+#' \item{`semi_join()`}{return all rows from `x` where there are matching
+#' values in `y`, keeping just columns from `x`.
#'
#' A semi join differs from an inner join because an inner join will return
-#' one row of \code{x} for each matching row of \code{y}, where a semi
-#' join will never duplicate rows of \code{x}.}
+#' one row of `x` for each matching row of `y`, where a semi
+#' join will never duplicate rows of `x`.}
#'
-#' \item{\code{anti_join}}{return all rows from \code{x} where there are not
-#' matching values in \code{y}, keeping just columns from \code{x}.}
+#' \item{`anti_join()`}{return all rows from `x` where there are not
+#' matching values in `y`, keeping just columns from `x`.}
#'
-#' \item{\code{full_join}}{return all rows and all columns from both \code{x} and \code{y}.
-#' Where there are not matching values, returns \code{NA} for the one missing.}
+#' \item{`full_join()`}{return all rows and all columns from both `x` and `y`.
+#' Where there are not matching values, returns `NA` for the one missing.}
#' }
#'
#' @section Grouping:
#'
#' Groups are ignored for the purpose of joining, but the result preserves
-#' the grouping of \code{x}.
+#' the grouping of `x`.
#'
#' @param x,y tbls to join
-#' @param by a character vector of variables to join by. If \code{NULL}, the
-#' default, \code{join} will do a natural join, using all variables with
+#' @param by a character vector of variables to join by. If `NULL`, the
+#' default, `*_join()` will do a natural join, using all variables with
#' common names across the two tables. A message lists the variables so
#' that you can check they're right (to suppress the message, simply
#' explicitly list the variables that you want to join).
#'
#' To join by different variables on x and y use a named vector.
-#' For example, \code{by = c("a" = "b")} will match \code{x.a} to
-#' \code{y.b}.
-#' @param copy If \code{x} and \code{y} are not from the same data source,
-#' and \code{copy} is \code{TRUE}, then \code{y} will be copied into the
-#' same src as \code{x}. This allows you to join tables across srcs, but
+#' For example, `by = c("a" = "b")` will match `x.a` to
+#' `y.b`.
+#' @param copy If `x` and `y` are not from the same data source,
+#' and `copy` is `TRUE`, then `y` will be copied into the
+#' same src as `x`. This allows you to join tables across srcs, but
#' it is a potentially expensive operation so you must opt into it.
-#' @param suffix If there are non-joined duplicate variables in \code{x} and
-#' \code{y}, these suffixes will be added to the output to diambiguate them.
+#' @param suffix If there are non-joined duplicate variables in `x` and
+#' `y`, these suffixes will be added to the output to disambiguate them.
+#' Should be a character vector of length 2.
#' @param ... other parameters passed onto methods
#' @name join
+#' @examples
+#' # "Mutating" joins add variables to the LHS
+#' band_members %>% inner_join(band_instruments)
+#' band_members %>% left_join(band_instruments)
+#' band_members %>% right_join(band_instruments)
+#' band_members %>% full_join(band_instruments)
+#'
+#' # "Filtering" joins keep cases from the LHS
+#' band_members %>% semi_join(band_instruments)
+#' band_members %>% anti_join(band_instruments)
+#'
+#' # To suppress the message, supply by
+#' band_members %>% inner_join(band_instruments, by = "name")
+#' # This is good practice in production code
+#'
+#' # Use a named `by` if the join variables have different names
+#' band_members %>% full_join(band_instruments2, by = c("name" = "artist"))
+#' # Note that only the key from the LHS is kept
NULL
#' @rdname join
@@ -104,25 +122,51 @@ anti_join <- function(x, y, by = NULL, copy = FALSE, ...) {
#'
#' @export
#' @keywords internal
-common_by <- function(by = NULL, x, y) {
- if (is.list(by)) return(by)
+common_by <- function(by = NULL, x, y) UseMethod("common_by", by)
+
+#' @export
+common_by.character <- function(by, x, y) {
+ by <- common_by_from_vector(by)
+ common_by.list(by, x, y)
+}
- if (!is.null(by)) {
- by <- by[!duplicated(by)]
- x <- names(by) %||% by
- y <- unname(by)
+common_by_from_vector <- function(by) {
+ by <- by[!duplicated(by)]
+ by_x <- names(by) %||% by
+ by_y <- unname(by)
- # If x partially named, assume unnamed are the same in both tables
- x[x == ""] <- y[x == ""]
+ # If x partially named, assume unnamed are the same in both tables
+ by_x[by_x == ""] <- by_y[by_x == ""]
+
+ list(x = by_x, y = by_y)
+}
+
+#' @export
+common_by.list <- function(by, x, y) {
+ x_vars <- tbl_vars(x)
+ if (!all(by$x %in% x_vars)) {
+ bad_args("by", "can't contain join column {missing} which is missing from LHS",
+ missing = fmt_obj(setdiff(by$x, x_vars))
+ )
+ }
- return(list(x = x, y = y))
+ y_vars <- tbl_vars(y)
+ if (!all(by$y %in% y_vars)) {
+ bad_args("by", "can't contain join column {missing} which is missing from RHS",
+ missing = fmt_obj(setdiff(by$y, y_vars))
+ )
}
+ by
+}
+
+#' @export
+common_by.NULL <- function(by, x, y) {
by <- intersect(tbl_vars(x), tbl_vars(y))
if (length(by) == 0) {
- stop("No common variables. Please specify `by` param.", call. = FALSE)
+ bad_args("by", "required, because the data sources have no common variables")
}
- message("Joining, by = ", utils::capture.output(dput(by)))
+ inform(auto_by_msg(by))
list(
x = by,
@@ -130,28 +174,28 @@ common_by <- function(by = NULL, x, y) {
)
}
-# Returns NULL if variables don't need to be renamed
-unique_names <- function(x_names, y_names, by, suffix = c(".x", ".y")) {
-
- common <- setdiff(intersect(x_names, y_names), by$x[by$x == by$y])
- if (length(common) == 0) return(NULL)
-
- suffix <- check_suffix(suffix)
-
- x_match <- match(common, x_names)
- x_new <- x_names
- x_new[x_match] <- paste0(x_names[x_match], suffix$x)
-
- y_match <- match(common, y_names)
- y_new <- y_names
- y_new[y_match] <- paste0(y_names[y_match], suffix$y)
+auto_by_msg <- function(by) {
+ by_quoted <- encodeString(by, quote = '"')
+ if (length(by_quoted) == 1L) {
+ by_code <- by_quoted
+ } else {
+ by_code <- paste0("c(", paste(by_quoted, collapse = ", "), ")")
+ }
+ paste0("Joining, by = ", by_code)
+}
- list(x = setNames(x_new, x_names), y = setNames(y_new, y_names))
+#' @export
+common_by.default <- function(by, x, y) {
+ bad_args("by", "must be a (named) character vector, list, or NULL for ",
+ "natural joins (not recommended in production code), not {type_of(by)}"
+ )
}
check_suffix <- function(x) {
if (!is.character(x) || length(x) != 2) {
- stop("`suffix` must be a character vector of length 2.", call. = FALSE)
+ bad_args("suffix", "must be a character vector of length 2, ",
+ "not {type_of(x)} of length {length(x)}"
+ )
}
list(x = x[1], y = x[2])
diff --git a/R/lazy-ops.R b/R/lazy-ops.R
deleted file mode 100644
index 6535419..0000000
--- a/R/lazy-ops.R
+++ /dev/null
@@ -1,244 +0,0 @@
-#' Lazy operations
-#'
-#' This set of S3 classes describe the action of dplyr verbs. These are
-#' currently used for SQL sources to separate the description of operations
-#' in R from their computation in SQL. This API is very new so is likely
-#' to evolve in the future.
-#'
-#' \code{op_vars} and \code{op_grps} compute the variables and groups from
-#' a sequence of lazy operations. \code{op_sort} tracks the order of the
-#' data for use in window functions.
-#'
-#' @keywords internal
-#' @name lazy_ops
-NULL
-
-op_base_remote <- function(src, x, vars = NULL) {
- # If not literal sql, must be a table identifier
- if (!is.sql(x)) {
- x <- ident(x)
- }
-
- if (is.null(vars)) {
- vars <- db_query_fields(src$con, x)
- }
- op_base("remote", src, x, vars)
-}
-
-#' @export
-print.op_base_remote <- function(x, ...) {
- cat("Source: ", src_desc(x$src), "\n", sep = "")
-
- if (inherits(x$x, "ident")) {
- cat("From: ", x$x, "\n", sep = "")
- } else {
- cat("From: <derived table>\n")
- }
-
- cat("<Table: ", x$x, ">\n", sep = "")
-}
-
-op_base_local <- function(df, env = parent.frame()) {
- op_base("local", src_df(env = env), df, names(df))
-}
-
-#' @export
-print.op_base_local <- function(x, ...) {
- cat("<Local data frame> ", dim_desc(x$x), "\n", sep = "")
-}
-
-#' @export
-#' @rdname lazy_ops
-op_base <- function(name, src, x, vars) {
- stopifnot(is.character(vars))
-
- structure(
- list(
- src = src,
- x = x,
- vars = vars
- ),
- class = c(paste0("op_base_", name), "op_base", "op")
- )
-
-}
-
-#' @export
-#' @rdname lazy_ops
-op_single <- function(name, x, dots = list(), args = list()) {
- structure(
- list(
- name = name,
- x = x,
- dots = dots,
- args = args
- ),
- class = c(paste0("op_", name), "op_single", "op")
- )
-}
-
-#' @export
-#' @rdname lazy_ops
-add_op_single <- function(name, .data, dots = list(), args = list()) {
- .data$ops <- op_single(name, x = .data$ops, dots = dots, args = args)
- .data
-}
-
-#' @export
-print.op_single <- function(x, ...) {
- print(x$x)
-
- cat("-> ", x$name, "()\n", sep = "")
- for (dot in x$dots) {
- cat(" - ", deparse_trunc(dot$expr), "\n", sep = "")
- }
-}
-
-#' @export
-#' @rdname lazy_ops
-op_double <- function(name, x, y, args = list()) {
- structure(
- list(
- name = name,
- x = x,
- y = y,
- args = args
- ),
- class = c(paste0("op_", name), "op_double", "op")
- )
-}
-
-# op_grps -----------------------------------------------------------------
-
-#' @export
-#' @rdname lazy_ops
-op_grps <- function(op) UseMethod("op_grps")
-#' @export
-op_grps.op_base <- function(op) character()
-#' @export
-op_grps.op_group_by <- function(op) {
- if (isTRUE(op$args$add)) {
- union(op_grps(op$x), names(op$dots))
- } else {
- names(op$dots)
- }
-}
-#' @export
-op_grps.op_ungroup <- function(op) {
- NULL
-}
-#' @export
-op_grps.op_summarise <- function(op) {
- grps <- op_grps(op$x)
- if (length(grps) == 1) {
- NULL
- } else {
- grps[-length(grps)]
- }
-}
-#' @export
-op_grps.op_single <- function(op) {
- op_grps(op$x)
-}
-#' @export
-op_grps.op_double <- function(op) {
- op_grps(op$x)
-}
-
-#' @export
-op_grps.tbl_lazy <- function(op) {
- op_grps(op$ops)
-}
-
-
-# op_vars -----------------------------------------------------------------
-
-#' @export
-#' @rdname lazy_ops
-op_vars <- function(op) UseMethod("op_vars")
-
-#' @export
-op_vars.op_base <- function(op) {
- op$vars
-}
-#' @export
-op_vars.op_select <- function(op) {
- names(select_vars_(op_vars(op$x), op$dots, include = op_grps(op$x)))
-}
-#' @export
-op_vars.op_rename <- function(op) {
- names(rename_vars_(op_vars(op$x), op$dots))
-}
-#' @export
-op_vars.op_summarise <- function(op) {
- c(op_grps(op$x), names(op$dots))
-}
-#' @export
-op_vars.op_mutate <- function(op) {
- unique(c(op_vars(op$x), names(op$dots)))
-}
-#' @export
-op_vars.op_single <- function(op) {
- op_vars(op$x)
-}
-#' @export
-op_vars.op_join <- function(op) {
- by <- op$args$by
- x_vars <- op_vars(op$x)
- y_vars <- op_vars(op$y)
-
- unique <- unique_names(x_vars, y_vars, by = by, suffix = op$args$suffix)
-
- if (is.null(unique)) {
- c(by$x, setdiff(x_vars, by$x), setdiff(y_vars, by$y))
- } else {
- union(unique$x, unique$y)
- }
-}
-#' @export
-op_vars.op_semi_join <- function(op) {
- op_vars(op$x)
-}
-#' @export
-op_vars.op_set_op <- function(op) {
- op_vars(op$x)
-}
-#' @export
-op_vars.tbl_lazy <- function(op) {
- op_vars(op$ops)
-}
-
-# op_sort -----------------------------------------------------------------
-
-# This is only used to determine the order for window functions
-# so it purposely ignores grouping.
-
-#' @export
-#' @rdname lazy_ops
-op_sort <- function(op) UseMethod("op_sort")
-#' @export
-op_sort.op_base <- function(op) NULL
-
-#' @export
-op_sort.op_summarise <- function(op) NULL
-
-#' @export
-op_sort.op_arrange <- function(op) {
- order_vars <- translate_sql_(op$dots, NULL, op_vars(op))
- c.sql(op_sort(op$x), order_vars, drop_null = TRUE)
-}
-
-#' @export
-op_sort.op_single <- function(op) {
- op_sort(op$x)
-}
-
-#' @export
-op_sort.op_double <- function(op) {
- op_sort(op$x)
-}
-
-#' @export
-op_sort.tbl_lazy <- function(op) {
- op_sort(op$ops)
-}
diff --git a/R/lead-lag.R b/R/lead-lag.R
index b2bae11..7ee12b0 100644
--- a/R/lead-lag.R
+++ b/R/lead-lag.R
@@ -1,12 +1,12 @@
#' Lead and lag.
#'
-#' Lead and lag are useful for comparing values offset by a constant (e.g. the
-#' previous or next value)
+#' Find the "next" or "previous" values in a vector. Useful for comparing values
+#' ahead of or behind the current values.
#'
#' @param x a vector of values
-#' @param n a postive integer of length 1, giving the number of positions to
+#' @param n a positive integer of length 1, giving the number of positions to
#' lead or lag by
-#' @param default value used for non-existant rows. Defaults to \code{NA}.
+#' @param default value used for non-existent rows. Defaults to `NA`.
#' @param order_by override the default ordering to use another vector
#' @param ... Needed for compatibility with lag generic.
#' @importFrom stats lag
@@ -39,8 +39,12 @@ lead <- function(x, n = 1L, default = NA, order_by = NULL, ...) {
return(with_order(order_by, lead, x, n = n, default = default))
}
+ if (length(n) != 1 || !is.numeric(n) || n < 0) {
+ bad_args("n", "must be a nonnegative integer scalar, ",
+ "not {type_of(n)} of length {length(n)}"
+ )
+ }
if (n == 0) return(x)
- if (n < 0 || length(n) > 1) stop("n must be a single positive integer")
xlen <- length(x)
n <- pmin(n, xlen)
@@ -57,8 +61,16 @@ lag <- function(x, n = 1L, default = NA, order_by = NULL, ...) {
return(with_order(order_by, lag, x, n = n, default = default))
}
+ if (inherits(x, "ts")) {
+ bad_args("x", "must be a vector, not a ts object, do you want `stats::lag()`?")
+ }
+
+ if (length(n) != 1 || !is.numeric(n) || n < 0) {
+ bad_args("n", "must be a nonnegative integer scalar, ",
+ "not {type_of(n)} of length {length(n)}"
+ )
+ }
if (n == 0) return(x)
- if (n < 0 || length(n) > 1) stop("n must be a single positive integer")
xlen <- length(x)
n <- pmin(n, xlen)
diff --git a/R/location.R b/R/location.R
index 39f191a..52c72eb 100644
--- a/R/location.R
+++ b/R/location.R
@@ -5,6 +5,7 @@
#'
#' @param df a data frame
#' @param x,y two data frames to compare
+#' @keywords internal
#' @export
#' @examples
#' location(mtcars)
diff --git a/R/manip.r b/R/manip.r
index 273fd34..d9247fa 100644
--- a/R/manip.r
+++ b/R/manip.r
@@ -1,47 +1,83 @@
-#' Return rows with matching conditions.
+#' Return rows with matching conditions
+#'
+#' Use `filter()` find rows/cases where conditions are true. Unlike
+#' base subsetting, rows where the condition evaluates to `NA` are dropped.
+#'
+#' Note that dplyr is not yet smart enough to optimise filtering optimisation
+#' on grouped datasets that don't need grouped calculations. For this reason,
+#' filtering is often considerably faster on [ungroup()]ed data.
+#'
+#' @section Useful filter functions:
+#'
+#' * [`==`], [`>`], [`>=`] etc
+#' * [`&`], [`|`], [`!`], [xor()]
+#' * [is.na()]
+#' * [between()], [near()]
+#'
+#' @section Tidy data:
+#' When applied to a data frame, row names are silently dropped. To preserve,
+#' convert to an explicit variable with [tibble::rownames_to_column()].
+#'
+#' @section Scoped filtering:
+#' The three [scoped] variants ([filter_all()], [filter_if()] and
+#' [filter_at()]) make it easy to apply a filtering condition to a
+#' selection of variables.
#'
#' @family single table verbs
#' @param .data A tbl. All main verbs are S3 generics and provide methods
-#' for \code{\link{tbl_df}}, \code{\link[dtplyr]{tbl_dt}} and \code{\link{tbl_sql}}.
-#' @param ... Logical predicates. Multiple conditions are combined with \code{&}.
-#' @param .dots Used to work around non-standard evaluation. See
-#' \code{vignette("nse")} for details.
-#' @return An object of the same class as \code{.data}.
-#'
-#' Data frame row names are silently dropped. To preserve, convert to an
-#' explicit variable.
+#' for [tbl_df()], [dtplyr::tbl_dt()] and [dbplyr::tbl_dbi()].
+#' @param ... Logical predicates defined in terms of the variables in `.data`.
+#' Multiple conditions are combined with `&`. Only rows where the
+#' condition evaluates to `TRUE` are kept.
+#'
+#' These arguments are automatically [quoted][rlang::quo] and
+#' [evaluated][rlang::eval_tidy] in the context of the data
+#' frame. They support [unquoting][rlang::quasiquotation] and
+#' splicing. See `vignette("programming")` for an introduction to
+#' these concepts.
+#' @return An object of the same class as `.data`.
+#' @seealso [filter_all()], [filter_if()] and [filter_at()].
#' @export
#' @examples
-#' filter(mtcars, cyl == 8)
-#' filter(mtcars, cyl < 6)
+#' filter(starwars, species == "Human")
+#' filter(starwars, mass > 1000)
#'
#' # Multiple criteria
-#' filter(mtcars, cyl < 6 & vs == 1)
-#' filter(mtcars, cyl < 6 | vs == 1)
+#' filter(starwars, hair_color == "none" & eye_color == "black")
+#' filter(starwars, hair_color == "none" | eye_color == "black")
#'
#' # Multiple arguments are equivalent to and
-#' filter(mtcars, cyl < 6, vs == 1)
+#' filter(starwars, hair_color == "none", eye_color == "black")
filter <- function(.data, ...) {
- filter_(.data, .dots = lazyeval::lazy_dots(...))
+ UseMethod("filter")
+}
+#' @export
+filter.default <- function(.data, ...) {
+ filter_(.data, .dots = compat_as_lazy_dots(...))
}
-
#' @export
-#' @rdname filter
-filter_ <- function(.data, ..., .dots) {
+#' @rdname se-deprecated
+filter_ <- function(.data, ..., .dots = list()) {
UseMethod("filter_")
}
-#' Select rows by position.
+#' Select rows by position
#'
#' Slice does not work with relational databases because they have no
#' intrinsic notion of row order. If you want to perform the equivalent
-#' operation, use \code{\link{filter}()} and \code{\link{row_number}()}.
+#' operation, use [filter()] and [row_number()].
#'
#' @family single table verbs
-#' @param .data A tbl. All main verbs are S3 generics and provide methods
-#' for \code{\link{tbl_df}}, \code{\link[dtplyr]{tbl_dt}} and \code{\link{tbl_sql}}.
-#' @param ... Integer row values
+#' @param .data A tbl.
+#' @param ... Integer row values.
+#'
+#' These arguments are automatically [quoted][rlang::quo] and
+#' [evaluated][rlang::eval_tidy] in the context of the data
+#' frame. They support [unquoting][rlang::quasiquotation] and
+#' splicing. See `vignette("programming")` for an introduction to
+#' these concepts.
#' @inheritParams filter
+#' @inheritSection filter Tidy data
#' @export
#' @examples
#' slice(mtcars, 1L)
@@ -58,16 +94,31 @@ filter_ <- function(.data, ..., .dots) {
#' filter(mtcars, row_number() == n())
#' filter(mtcars, between(row_number(), 5, n()))
slice <- function(.data, ...) {
- slice_(.data, .dots = lazyeval::lazy_dots(...))
+ UseMethod("slice")
}
-
#' @export
-#' @rdname slice
-slice_ <- function(.data, ..., .dots) {
+slice.default <- function(.data, ...) {
+ slice_(.data, .dots = compat_as_lazy_dots(...))
+}
+#' @export
+#' @rdname se-deprecated
+slice_ <- function(.data, ..., .dots = list()) {
UseMethod("slice_")
}
-#' Summarise multiple values to a single value.
+#' Reduces multiple values down to a single value
+#'
+#' `summarise()` is typically used on grouped data created by [group_by()].
+#' The output will have one row for each group.
+#'
+#' @section Useful functions:
+#'
+#' * Center: [mean()], [median()]
+#' * Spread: [sd()], [IQR()], [mad()]
+#' * Range: [min()], [max()], [quantile()]
+#' * Position: [first()], [last()], [nth()],
+#' * Count: [n()], [n_distinct()]
+#' * Logical: [any()], [all()]
#'
#' @section Backend variations:
#'
@@ -76,255 +127,349 @@ slice_ <- function(.data, ..., .dots) {
#'
#' @export
#' @inheritParams filter
-#' @param ... Name-value pairs of summary functions like \code{\link{min}()},
-#' \code{\link{mean}()}, \code{\link{max}()} etc.
+#' @inheritSection filter Tidy data
+#' @param ... Name-value pairs of summary functions. The name will be the
+#' name of the variable in the result. The value should be an expression
+#' that returns a single value like `min(x)`, `n()`, or `sum(is.na(y))`.
+#'
+#' These arguments are automatically [quoted][rlang::quo] and
+#' [evaluated][rlang::eval_tidy] in the context of the data
+#' frame. They support [unquoting][rlang::quasiquotation] and
+#' splicing. See `vignette("programming")` for an introduction to
+#' these concepts.
#' @family single table verbs
-#' @return An object of the same class as \code{.data}. One grouping level will
+#' @return An object of the same class as `.data`. One grouping level will
#' be dropped.
-#'
-#' Data frame row names are silently dropped. To preserve, convert to an
-#' explicit variable.
#' @examples
-#' summarise(mtcars, mean(disp))
-#' summarise(group_by(mtcars, cyl), mean(disp))
-#' summarise(group_by(mtcars, cyl), m = mean(disp), sd = sd(disp))
-#'
-#' # With data frames, you can create and immediately use summaries
-#' by_cyl <- mtcars %>% group_by(cyl)
-#' by_cyl %>% summarise(a = n(), b = a + 1)
-#'
-#' \dontrun{
-#' # You can't with data tables or databases
-#' by_cyl_dt <- mtcars %>% dtplyr::tbl_dt() %>% group_by(cyl)
-#' by_cyl_dt %>% summarise(a = n(), b = a + 1)
-#'
-#' by_cyl_db <- src_sqlite(":memory:", create = TRUE) %>%
-#' copy_to(mtcars) %>% group_by(cyl)
-#' by_cyl_db %>% summarise(a = n(), b = a + 1)
-#' }
+#' # A summary applied to ungrouped tbl returns a single row
+#' mtcars %>%
+#' summarise(mean = mean(disp), n = n())
+#'
+#' # Usually, you'll want to group first
+#' mtcars %>%
+#' group_by(cyl) %>%
+#' summarise(mean = mean(disp), n = n())
+#'
+#' # Each summary call removes one grouping level (since that group
+#' # is now just a single row)
+#' mtcars %>%
+#' group_by(cyl, vs) %>%
+#' summarise(cyl_n = n()) %>%
+#' group_vars()
+#'
+#' # Note that with data frames, newly created summaries immediately
+#' # overwrite existing variables
+#' mtcars %>%
+#' group_by(cyl) %>%
+#' summarise(disp = mean(disp), sd = sd(disp))
+#'
+#'
+#' # summarise() supports quasiquotation. You can unquote raw
+#' # expressions or quosures:
+#' var <- quo(mean(cyl))
+#' summarise(mtcars, !! var)
summarise <- function(.data, ...) {
- summarise_(.data, .dots = lazyeval::lazy_dots(...))
+ UseMethod("summarise")
}
-
#' @export
-#' @rdname summarise
-summarise_ <- function(.data, ..., .dots) {
+summarise.default <- function(.data, ...) {
+ summarise_(.data, .dots = compat_as_lazy_dots(...))
+}
+#' @export
+#' @rdname se-deprecated
+summarise_ <- function(.data, ..., .dots = list()) {
UseMethod("summarise_")
}
#' @rdname summarise
#' @export
summarize <- summarise
-
-#' @rdname summarise
+#' @rdname se-deprecated
#' @export
summarize_ <- summarise_
-#' Add new variables.
+#' Add new variables
+#'
+#' `mutate()` adds new variables and preserves existing;
+#' `transmute()` drops existing variables.
+#'
+#' @section Useful functions:
+#'
+#' * [`+`], [`-`] etc
+#'
+#' * [log()]
+#'
+#' * [lead()], [lag()]
+#'
+#' * [dense_rank()], [min_rank()], [percent_rank()], [row_number()],
+#' [cume_dist()], [ntile()]
+#'
+#' * [cumsum()], [cummean()], [cummin()], [cummax()], [cumany()], [cumall()]
#'
-#' Mutate adds new variables and preserves existing; transmute drops existing
-#' variables.
+#' * [na_if()], [coalesce()]
+#'
+#' * [if_else()], [recode()], [case_when()]
+#'
+#' @section Scoped mutation and transmutation:
+#'
+#' The three [scoped] variants of `mutate()` ([mutate_all()],
+#' [mutate_if()] and [mutate_at()]) and the three variants of
+#' `transmute()` ([transmute_all()], [transmute_if()],
+#' [transmute_at()]) make it easy to apply a transformation to a
+#' selection of variables.
#'
#' @export
#' @inheritParams filter
-#' @param ... Name-value pairs of expressions. Use \code{NULL} to drop
+#' @inheritSection filter Tidy data
+#' @param ... Name-value pairs of expressions. Use `NULL` to drop
#' a variable.
-#' @family single table verbs
-#' @return An object of the same class as \code{.data}.
#'
-#' Data frame row names are silently dropped. To preserve, convert to an
-#' explicit variable.
+#' These arguments are automatically [quoted][rlang::quo] and
+#' [evaluated][rlang::eval_tidy] in the context of the data
+#' frame. They support [unquoting][rlang::quasiquotation] and
+#' splicing. See `vignette("programming")` for an introduction to
+#' these concepts.
+#' @family single table verbs
+#' @return An object of the same class as `.data`.
#' @examples
-#' mutate(mtcars, displ_l = disp / 61.0237)
-#' transmute(mtcars, displ_l = disp / 61.0237)
-#'
-#' mutate(mtcars, cyl = NULL)
+#' # Newly created variables are available immediately
+#' mtcars %>% as_tibble() %>% mutate(
+#' cyl2 = cyl * 2,
+#' cyl4 = cyl2 * 2
+#' )
+#'
+#' # You can also use mutate() to remove variables and
+#' # modify existing variables
+#' mtcars %>% as_tibble() %>% mutate(
+#' mpg = NULL,
+#' disp = disp * 0.0163871 # convert to litres
+#' )
+#'
+#'
+#' # window functions are useful for grouped mutates
+#' mtcars %>%
+#' group_by(cyl) %>%
+#' mutate(rank = min_rank(desc(mpg)))
+#' # see `vignette("window-functions")` for more details
+#'
+#' # You can drop variables by setting them to NULL
+#' mtcars %>% mutate(cyl = NULL)
+#'
+#' # mutate() vs transmute --------------------------
+#' # mutate() keeps all existing variables
+#' mtcars %>%
+#' mutate(displ_l = disp / 61.0237)
+#'
+#' # transmute keeps only the variables you create
+#' mtcars %>%
+#' transmute(displ_l = disp / 61.0237)
+#'
+#'
+#' # mutate() supports quasiquotation. You can unquote quosures, which
+#' # can refer to both contextual variables and variable names:
+#' var <- 100
+#' as_tibble(mtcars) %>% mutate(cyl = !! quo(cyl * var))
mutate <- function(.data, ...) {
- mutate_(.data, .dots = lazyeval::lazy_dots(...))
+ UseMethod("mutate")
}
-
#' @export
-#' @rdname mutate
-mutate_ <- function(.data, ..., .dots) {
+mutate.default <- function(.data, ...) {
+ mutate_(.data, .dots = compat_as_lazy_dots(...))
+}
+#' @export
+#' @rdname se-deprecated
+mutate_ <- function(.data, ..., .dots = list()) {
UseMethod("mutate_")
}
#' @rdname mutate
#' @export
transmute <- function(.data, ...) {
- transmute_(.data, .dots = lazyeval::lazy_dots(...))
+ UseMethod("transmute")
}
-
-#' @rdname mutate
+#' @rdname se-deprecated
#' @export
-transmute_ <- function(.data, ..., .dots) {
+transmute_ <- function(.data, ..., .dots = list()) {
UseMethod("transmute_")
}
-
#' @export
-transmute_.default <- function(.data, ..., .dots) {
- dots <- lazyeval::all_dots(.dots, ..., all_named = TRUE)
- out <- mutate_(.data, .dots = dots)
+transmute.default <- function(.data, ...) {
+ dots <- named_quos(...)
+ out <- mutate(.data, !!! dots)
keep <- names(dots)
select(out, one_of(keep))
}
+#' @export
+transmute_.default <- function(.data, ..., .dots = list()) {
+ dots <- compat_lazy_dots(.dots, caller_env(), ...)
+ transmute(.data, !!! dots)
+}
-#' Arrange rows by variables.
+#' Arrange rows by variables
#'
-#' Use \code{\link{desc}} to sort a variable in descending order. Generally,
-#' this will not also automatically order by grouping variables.
+#' Use [desc()] to sort a variable in descending order.
#'
#' @section Locales:
-#'
-#' Note that for local data frames, the ordering is done in C++ code which
-#' does not have access to the local specific ordering usually done in R.
-#' This means that strings are ordered as if in the C locale.
+#' The sort order for character vectors will depend on the collating sequence
+#' of the locale in use: see [locales()].
#'
#' @export
#' @inheritParams filter
+#' @inheritSection filter Tidy data
#' @param ... Comma separated list of unquoted variable names. Use
-#' \code{\link{desc}} to sort a variable in descending order.
+#' [desc()] to sort a variable in descending order.
#' @family single table verbs
-#' @return An object of the same class as \code{.data}.
-#'
-#' Data frame row names are silently dropped. To preserve, convert to an
-#' explicit variable.
+#' @return An object of the same class as `.data`.
#' @examples
#' arrange(mtcars, cyl, disp)
#' arrange(mtcars, desc(disp))
+#'
+#' # grouped arrange ignores groups
+#' by_cyl <- mtcars %>% group_by(cyl)
+#' by_cyl %>% arrange(desc(wt))
+#' # Unless you specifically ask:
+#' by_cyl %>% arrange(desc(wt), .by_group = TRUE)
arrange <- function(.data, ...) {
- arrange_(.data, .dots = lazyeval::lazy_dots(...))
+ UseMethod("arrange")
+}
+#' @export
+arrange.default <- function(.data, ...) {
+ arrange_(.data, .dots = compat_as_lazy_dots(...))
+}
+#' @export
+#' @rdname se-deprecated
+arrange_ <- function(.data, ..., .dots = list()) {
+ UseMethod("arrange_")
}
#' @export
#' @rdname arrange
-arrange_ <- function(.data, ..., .dots) {
- UseMethod("arrange_")
+#' @param .by_group If `TRUE`, will sort first by grouping variable. Applies to
+#' grouped data frames only.
+arrange.grouped_df <- function(.data, ..., .by_group = FALSE) {
+ if (.by_group) {
+ dots <- quos(!!!groups(.data), ...)
+ } else {
+ dots <- quos(...)
+ }
+
+ arrange_impl(.data, dots)
}
-#' Select/rename variables by name.
+#' Select/rename variables by name
#'
-#' \code{select()} keeps only the variables you mention; \code{rename()}
+#' `select()` keeps only the variables you mention; `rename()`
#' keeps all variables.
#'
-#' @section Special functions:
-#' As well as using existing functions like \code{:} and \code{c}, there are
-#' a number of special functions that only work inside \code{select}
+#' @section Useful functions:
+#' As well as using existing functions like `:` and `c()`, there are
+#' a number of special functions that only work inside `select`
#'
-
+#' * [starts_with()], [ends_with()], [contains()]
+#' * [matches()]
+#' * [num_range()]
+#'
+#' To drop variables, use `-`.
#'
-#' To drop variables, use \code{-}. You can rename variables with
-#' named arguments.
+#' Note that except for `:`, `-` and `c()`, all complex expressions
+#' are evaluated outside the data frame context. This is to prevent
+#' accidental matching of data frame variables when you refer to
+#' variables from the calling context.
+#'
+#' @section Scoped selection and renaming:
+#'
+#' The three [scoped] variants of `select()` ([select_all()],
+#' [select_if()] and [select_at()]) and the three variants of
+#' `rename()` ([rename_all()], [rename_if()], [rename_at()]) make it
+#' easy to apply a renaming function to a selection of variables.
#'
#' @inheritParams filter
-#' @param ... Comma separated list of unquoted expressions. You can treat
-#' variable names like they are positions. Use positive values to select
-#' variables; use negative values to drop variables.
-#' @param .dots Use \code{select_()} to do standard evaluation. See
-#' \code{vignette("nse")} for details
-#' @return An object of the same class as \code{.data}.
-#'
-#' Data frame row names are silently dropped. To preserve, convert to an
-#' explicit variable.
+#' @inheritSection filter Tidy data
+#' @param ... One or more unquoted expressions separated by commas.
+#' You can treat variable names like they are positions.
+#'
+#' Positive values select variables; negative values to drop variables.
+#' If the first expression is negative, `select()` will automatically
+#' start with all variables.
+#'
+#' Use named arguments to rename selected variables.
+#'
+#' These arguments are automatically [quoted][rlang::quo] and
+#' [evaluated][rlang::eval_tidy] in a context where column names
+#' represent column positions. They support
+#' [unquoting][rlang::quasiquotation] and splicing. See
+#' `vignette("programming")` for an introduction to these concepts.
+#' @return An object of the same class as `.data`.
#' @family single table verbs
#' @export
#' @examples
-#' iris <- tbl_df(iris) # so it prints a little nicer
+#' iris <- as_tibble(iris) # so it prints a little nicer
#' select(iris, starts_with("Petal"))
#' select(iris, ends_with("Width"))
-#' select(iris, contains("etal"))
-#' select(iris, matches(".t."))
-#' select(iris, Petal.Length, Petal.Width)
-#' vars <- c("Petal.Length", "Petal.Width")
-#' select(iris, one_of(vars))
+#'
+#' # Move Species variable to the front
+#' select(iris, Species, everything())
#'
#' df <- as.data.frame(matrix(runif(100), nrow = 10))
#' df <- tbl_df(df[c(3, 4, 7, 1, 9, 8, 5, 2, 6, 10)])
#' select(df, V4:V6)
#' select(df, num_range("V", 4:6))
#'
-#' # Drop variables
+#' # Drop variables with -
#' select(iris, -starts_with("Petal"))
-#' select(iris, -ends_with("Width"))
-#' select(iris, -contains("etal"))
-#' select(iris, -matches(".t."))
-#' select(iris, -Petal.Length, -Petal.Width)
#'
-#' # Rename variables:
+#'
+#' # The .data pronoun is available:
+#' select(mtcars, .data$cyl)
+#' select(mtcars, .data$mpg : .data$disp)
+#'
+#' # However it isn't available within calls since those are evaluated
+#' # outside of the data context. This would fail if run:
+#' # select(mtcars, identical(.data$cyl))
+#'
+#'
+#' # Renaming -----------------------------------------
#' # * select() keeps only the variables you specify
#' select(iris, petal_length = Petal.Length)
-#' # Renaming multiple variables uses a prefix:
-#' select(iris, petal = starts_with("Petal"))
-#'
-#' # Reorder variables: keep the variable "Species" in the front
-#' select(iris, Species, everything())
#'
#' # * rename() keeps all variables
#' rename(iris, petal_length = Petal.Length)
-#'
-#' # Programming with select ---------------------------------------------------
-#' select_(iris, ~Petal.Length)
-#' select_(iris, "Petal.Length")
-#' select_(iris, lazyeval::interp(~matches(x), x = ".t."))
-#' select_(iris, quote(-Petal.Length), quote(-Petal.Width))
-#' select_(iris, .dots = list(quote(-Petal.Length), quote(-Petal.Width)))
select <- function(.data, ...) {
- select_(.data, .dots = lazyeval::lazy_dots(...))
+ UseMethod("select")
}
-
#' @export
-#' @rdname select
-select_ <- function(.data, ..., .dots) {
- UseMethod("select_")
+select.default <- function(.data, ...) {
+ select_(.data, .dots = compat_as_lazy_dots(...))
}
-
-#' Select columns using a predicate
-#'
-#' This verb is analogous to \code{\link{summarise_if}()} and
-#' \code{\link{mutate_if}()} in that it lets you use a predicate on
-#' the columns of a data frame. Only those columns for which the
-#' predicate returns \code{TRUE} will be selected.
-#'
-#' Predicates can only be used with local sources like a data frame.
-#'
-#' @inheritParams summarise_all
-#' @param .data A local tbl source.
-#' @param ... Additional arguments passed to \code{.predicate}.
#' @export
-#' @examples
-#' iris %>% select_if(is.factor)
-#' iris %>% select_if(is.numeric)
-#' iris %>% select_if(function(col) is.numeric(col) && mean(col) > 3.5)
-select_if <- function(.data, .predicate, ...) {
- if (inherits(.data, "tbl_lazy")) {
- stop("Selection with predicate currently require local sources",
- call. = FALSE)
- }
- vars <- probe_colwise_names(.data, .predicate, ...)
- vars <- ensure_grouped_vars(vars, .data, notify = FALSE)
- select_(.data, .dots = vars)
+#' @rdname se-deprecated
+select_ <- function(.data, ..., .dots = list()) {
+ UseMethod("select_")
}
#' @rdname select
#' @export
rename <- function(.data, ...) {
- rename_(.data, .dots = lazyeval::lazy_dots(...))
+ UseMethod("rename")
}
-
-#' @rdname select
#' @export
-rename_ <- function(.data, ..., .dots) {
+rename.default <- function(.data, ...) {
+ rename_(.data, .dots = compat_as_lazy_dots(...))
+}
+#' @rdname se-deprecated
+#' @export
+rename_ <- function(.data, ..., .dots = list()) {
UseMethod("rename_")
}
#' The number of observations in the current group.
#'
-#' This function is implemented special for each data source and can only
-#' be used from within \code{\link{summarise}}, \code{\link{mutate}} and
-#' \code{\link{filter}}
+#' This function is implemented specifically for each data source and can only
+#' be used from within [summarise()], [mutate()] and
+#' [filter()].
#'
#' @export
#' @examples
@@ -335,5 +480,30 @@ rename_ <- function(.data, ..., .dots) {
#' filter(carriers, n() < 100)
#' }
n <- function() {
- stop("This function should not be called directly")
+ abort("This function should not be called directly")
}
+
+
+#' Deprecated SE versions of main verbs.
+#'
+#' dplyr used to offer twin versions of each verb suffixed with an
+#' underscore. These versions had standard evaluation (SE) semantics:
+#' rather than taking arguments by code, like NSE verbs, they took
+#' arguments by value. Their purpose was to make it possible to
+#' program with dplyr. However, dplyr now uses tidy evaluation
+#' semantics. NSE verbs still capture their arguments, but you can now
+#' unquote parts of these arguments. This offers full programmability
+#' with NSE verbs. Thus, the underscored versions are now superfluous.
+#'
+#' Unquoting triggers immediate evaluation of its operand and inlines
+#' the result within the captured expression. This result can be a
+#' value or an expression to be evaluated later with the rest of the
+#' argument. See `vignette("programming")` for more information.
+#'
+#' @name se-deprecated
+#' @param .data A data frame.
+#' @param dots,.dots,... Pair/values of expressions coercible to lazy objects.
+#' @param vars Various meanings depending on the verb.
+#' @param args Various meanings depending on the verb.
+#' @keywords internal
+NULL
diff --git a/R/na_if.R b/R/na_if.R
index da81a79..86434a8 100644
--- a/R/na_if.R
+++ b/R/na_if.R
@@ -1,13 +1,13 @@
-#' Convert values to NA.
+#' Convert values to NA
#'
-#' This is a translation of the SQL command \code{NULL_IF}. It is useful
-#' if you want to convert an annoying value to \code{NA}.
+#' This is a translation of the SQL command `NULLIF`. It is useful
+#' if you want to convert an annoying value to `NA`.
#'
#' @param x Vector to modify
-#' @param y If th
-#' @return A modified version of \code{x} that replaces any values that
-#' are equal to \code{y} with NA.
-#' @seealso \code{\link{coalesce}()} to replace missing values with a specified
+#' @param y Value to replace with NA
+#' @return A modified version of `x` that replaces any values that
+#' are equal to `y` with NA.
+#' @seealso [coalesce()] to replace missing values with a specified
#' value.
#' @export
#' @examples
@@ -20,9 +20,7 @@
#' y <- c("abc", "def", "", "ghi")
#' na_if(y, "")
na_if <- function(x, y) {
- if (length(y) != length(x) && length(y) != 1) {
- stop("`y` must be length 1 or same length as `x`", call. = FALSE)
- }
+ check_length(y, x, fmt_args("y"), glue("same as {fmt_args(~x)}"))
x[x == y] <- NA
x
diff --git a/R/near.R b/R/near.R
index a8093a7..d28021b 100644
--- a/R/near.R
+++ b/R/near.R
@@ -1,7 +1,7 @@
-#' Compare two numeric vectors.
+#' Compare two numeric vectors
#'
#' This is a safe way of comparing if two vectors of floating point numbers
-#' are (pairwise) equal. This is safer than using \code{==}, because it has
+#' are (pairwise) equal. This is safer than using `==`, because it has
#' a built in tolerance
#'
#' @param x,y Numeric vectors to compare
diff --git a/R/nth-value.R b/R/nth-value.R
index bc4ce0c..705f6bb 100644
--- a/R/nth-value.R
+++ b/R/nth-value.R
@@ -1,4 +1,4 @@
-#' Extract the first, last or nth value from a vector.
+#' Extract the first, last or nth value from a vector
#'
#' These are straightforward wrappers around \code{\link{[[}}. The main
#' advantage is that you can provide an optional secondary vector that defines
@@ -6,30 +6,39 @@
#' than expected.
#'
#' @param x A vector
-#' @param n For \code{nth_value}, a single integer specifying the position.
-#' Negative integers index from the end (i.e. \code{-1L} will return the
+#' @param n For `nth_value()`, a single integer specifying the position.
+#' Negative integers index from the end (i.e. `-1L` will return the
#' last value in the vector).
#'
#' If a double is supplied, it will be silently truncated.
#' @param order_by An optional vector used to determine the order
#' @param default A default value to use if the position does not exist in
-#' the input. This is guessed by default for atomic vectors, where a
-#' missing value of the appropriate type is return, and for lists, where
-#' a \code{NULL} is return. For more complicated objects, you'll need to
-#' supply this value.
-#' @return A single value. \code{[[} is used to do the subsetting.
+#' the input. This is guessed by default for base vectors, where a
+#' missing value of the appropriate type is returned, and for lists, where
+#' a `NULL` is returned.
+#'
+#' For more complicated objects, you'll need to supply this value.
+#' Make sure it is the same type as `x`.
+#' @return A single value. `[[` is used to do the subsetting.
#' @export
#' @examples
#' x <- 1:10
#' y <- 10:1
#'
+#' first(x)
+#' last(y)
+#'
#' nth(x, 1)
#' nth(x, 5)
#' nth(x, -2)
#' nth(x, 11)
#'
#' last(x)
+#' # Second argument provides optional ordering
#' last(x, y)
+#'
+#' # These functions always return a single value
+#' first(integer())
nth <- function(x, n, order_by = NULL, default = default_missing(x)) {
stopifnot(length(n) == 1, is.numeric(n))
n <- trunc(n)
@@ -46,7 +55,7 @@ nth <- function(x, n, order_by = NULL, default = default_missing(x)) {
if (is.null(order_by)) {
x[[n]]
} else {
- x[[order(order_by)[n]]]
+ x[[ order(order_by)[[n]] ]]
}
}
@@ -63,19 +72,19 @@ last <- function(x, order_by = NULL, default = default_missing(x)) {
}
default_missing <- function(x) {
- # The user needs to supply a default for anything with attributes
- if (!is.vector(x)) {
- stop("Don't know how to generate default for object of class ",
- paste0(class(x), collapse = "/"), call. = FALSE)
- }
+ UseMethod("default_missing")
+}
- if (is.list(x)) {
+#' @export
+default_missing.default <- function(x) {
+ if (!is.object(x) && is.list(x)) {
NULL
- } else if (is.vector(x) && is.atomic(x)) {
- def <- NA
- storage.mode(def) <- storage.mode(x)
- def
+ } else {
+ x[NA_real_]
}
}
-
+#' @export
+default_missing.data.frame <- function(x) {
+ rep(NA, nrow(x))
+}
diff --git a/R/order-by.R b/R/order-by.R
index c983b5e..53bf9d2 100644
--- a/R/order-by.R
+++ b/R/order-by.R
@@ -1,12 +1,12 @@
-#' A helper function for ordering window function output.
-#'
-#' This is a useful function to control the order of window functions in
-#' R that don't have a specific ordering parameter. When translated to SQL
-#' it will modify the order clause of the OVER function.
-#'
-#' This function works by changing the \code{call} to instead call
-#' \code{\link{with_order}} with the appropriate arguments.
-#'
+#' A helper function for ordering window function output
+#'
+#' This function makes it possible to control the ordering of window functions
+#' in R that don't have a specific ordering parameter. When translated to SQL
+#' it will modify the order clause of the OVER function.
+#'
+#' This function works by changing the `call` to instead call
+#' [with_order()] with the appropriate arguments.
+#'
#' @param order_by a vector to order_by
#' @param call a function call to a window function, where the first argument
#' is the vector being operated on
@@ -16,40 +16,43 @@
#' x <- 10:1
#' y <- 1:10
#' order_by(x, cumsum(y))
-#'
+#'
#' df <- data.frame(year = 2000:2005, value = (0:5) ^ 2)
#' scrambled <- df[sample(nrow(df)), ]
-#'
+#'
#' wrong <- mutate(scrambled, running = cumsum(value))
#' arrange(wrong, year)
-#'
+#'
#' right <- mutate(scrambled, running = order_by(year, cumsum(value)))
#' arrange(right, year)
order_by <- function(order_by, call) {
- call <- substitute(call)
- stopifnot(is.call(call))
-
- new_call <- as.call(c(
- quote(with_order),
- list(substitute(order_by)),
- as.list(call)
- ))
- eval(new_call, parent.frame())
+ quo <- enquo(call)
+ if (!quo_is_lang(quo)) {
+ type <- friendly_type(type_of(get_expr(quo)))
+ bad_args("call", "must be a function call, not { type }")
+ }
+
+ fn <- set_expr(quo, node_car(get_expr(quo)))
+ args <- node_cdr(get_expr(quo))
+ args <- map(args, new_quosure, f_env(quo))
+
+ quo <- quo(with_order(!! order_by, !! fn, !!! args))
+ eval_tidy(quo)
}
#' Run a function with one order, translating result back to original order
-#'
+#'
#' This is used to power the ordering parameters of dplyr's window functions
-#'
+#'
#' @param order_by vector to order by
#' @param fun window function
-#' @param x,... arguments to \code{f}
+#' @param x,... arguments to `f`
#' @keywords internal
#' @export
with_order <- function(order_by, fun, x, ...) {
ord <- order(order_by)
undo <- match(seq_along(order_by), ord)
-
+
out <- fun(x[ord], ...)
out[undo]
}
diff --git a/R/over.R b/R/over.R
deleted file mode 100644
index 93d9625..0000000
--- a/R/over.R
+++ /dev/null
@@ -1,59 +0,0 @@
-# Generate SQL expression for window function
-# over("avg(x)", frame = c(-Inf, 0))
-# over("avg(x)", order = "y")
-# over("avg(x)", order = c("x", "y"))
-# over("avg(x)")
-over <- function(expr, partition = NULL, order = NULL, frame = NULL) {
- if (length(partition) == 0) {
- partition <- NULL
- }
- if (!is.null(partition)) {
- if (!is.sql(partition)) {
- partition <- ident(partition)
- }
-
- partition <- build_sql("PARTITION BY ",
- sql_vector(escape(partition, con = partition_con()), collapse = ", ", parens = FALSE))
- }
- if (!is.null(order)) {
- if (!is.sql(order)) {
- order <- ident(order)
- }
-
- order <- build_sql("ORDER BY ", sql_vector(escape(order), collapse = ", ", parens = FALSE))
- }
- if (!is.null(frame)) {
- if (is.null(order)) {
- warning(
- "Windowed expression '", expr, "' does not have explicit order.\n",
- "Please use arrange() to make determinstic.",
- call. = FALSE
- )
- }
-
- if (is.numeric(frame)) frame <- rows(frame[1], frame[2])
- frame <- build_sql("ROWS ", frame)
- }
-
- over <- sql_vector(compact(list(partition, order, frame)), parens = TRUE)
- sql <- build_sql(expr, " OVER ", over)
-
- sql
-}
-
-rows <- function(from = -Inf, to = 0) {
- if (from >= to) stop("from must be less than to", call. = FALSE)
-
- dir <- function(x) if (x < 0) "PRECEDING" else "FOLLOWING"
- val <- function(x) if (is.finite(x)) as.integer(abs(x)) else "UNBOUNDED"
- bound <- function(x) {
- if (x == 0) return("CURRENT ROW")
- paste(val(x), dir(x))
- }
-
- if (to == 0) {
- sql(bound(from))
- } else {
- sql(paste0("BETWEEN ", bound(from), " AND ", bound(to)))
- }
-}
diff --git a/R/partial-eval.r b/R/partial-eval.r
deleted file mode 100644
index adcbfd1..0000000
--- a/R/partial-eval.r
+++ /dev/null
@@ -1,124 +0,0 @@
-#' Partially evaluate an expression.
-#'
-#' This function partially evaluates an expression, using information from
-#' the tbl to determine whether names refer to local expressions
-#' or remote variables. This simplifies SQL translation because expressions
-#' don't need to carry around their environment - all revelant information
-#' is incorporated into the expression.
-#'
-#' @section Symbol substitution:
-#'
-#' \code{partial_eval} needs to guess if you're referring to a variable on the
-#' server (remote), or in the current environment (local). It's not possible to
-#' do this 100% perfectly. \code{partial_eval} uses the following heuristic:
-#'
-#' \itemize{
-#' \item If the tbl variables are known, and the symbol matches a tbl
-#' variable, then remote.
-#' \item If the symbol is defined locally, local.
-#' \item Otherwise, remote.
-#' }
-#'
-#' @param call an unevaluated expression, as produced by \code{\link{quote}}
-#' @param tbl a tbl object
-#' @param env environment in which to search for local values
-#' @export
-#' @keywords internal
-#' @examples
-#' if (require("Lahman")) {
-#' bdf <- tbl_df(Batting)
-#' partial_eval(quote(year > 1980), bdf)
-#'
-#' ids <- c("ansonca01", "forceda01", "mathebo01")
-#' partial_eval(quote(id %in% ids), bdf)
-#'
-#' # You can use local to disambiguate between local and remote
-#' # variables: otherwise remote is always preferred
-#' year <- 1980
-#' partial_eval(quote(year > year), bdf)
-#' partial_eval(quote(year > local(year)), bdf)
-#'
-#' # Functions are always assumed to be remote. Use local to force evaluation
-#' # in R.
-#' f <- function(x) x + 1
-#' partial_eval(quote(year > f(1980)), bdf)
-#' partial_eval(quote(year > local(f(1980))), bdf)
-#'
-#' # For testing you can also use it with the tbl omitted
-#' partial_eval(quote(1 + 2 * 3))
-#' x <- 1
-#' partial_eval(quote(x ^ y))
-#' }
-partial_eval <- function(call, tbl = NULL, env = parent.frame()) {
- if (is.atomic(call)) return(call)
-
- if (inherits(call, "lazy_dots")) {
- lapply(call, function(l) partial_eval(l$expr, tbl, l$env))
- } else if (is.list(call)) {
- lapply(call, partial_eval, tbl = tbl, env = env)
- } else if (is.symbol(call)) {
- name <- as.character(call)
- if (!is.null(tbl) && name %in% tbl_vars(tbl)) {
- call
- } else if (exists(name, env)) {
- eval(call, env)
- } else {
- call
- }
- } else if (is.call(call)) {
- # Process call arguments recursively, unless user has manually called
- # remote/local
- name <- as.character(call[[1]])
- if (name == "local") {
- eval(call[[2]], env)
- } else if (name %in% c("$", "[[", "[")) {
- # Subsetting is always done locally
- eval(call, env)
- } else if (name == "remote") {
- call[[2]]
- } else {
- call[-1] <- lapply(call[-1], partial_eval, tbl = tbl, env = env)
- call
- }
- } else {
- stop("Unknown input type: ", class(call), call. = FALSE)
- }
-}
-
-
-partial_eval2 <- function(call, vars = character(), env = parent.frame()) {
- if (is.atomic(call)) return(call)
-
- if (inherits(call, "lazy_dots")) {
- lapply(call, function(l) partial_eval2(l$expr, vars, l$env))
- } else if (is.list(call)) {
- lapply(call, partial_eval2, vars, env = env)
- } else if (is.symbol(call)) {
- name <- as.character(call)
- if (name %in% vars) {
- call
- } else if (exists(name, env)) {
- eval(call, env)
- } else {
- call
- }
- } else if (is.call(call)) {
- # Process call arguments recursively, unless user has manually called
- # remote/local
- name <- as.character(call[[1]])
- if (name == "local") {
- eval(call[[2]], env)
- } else if (name %in% c("$", "[[", "[")) {
- # Subsetting is always done locally
- eval(call, env)
- } else if (name == "remote") {
- call[[2]]
- } else {
- call[-1] <- lapply(call[-1], partial_eval2, vars = vars, env = env)
- call
- }
- } else {
- stop("Unknown input type: ", class(call), call. = FALSE)
- }
-}
-
diff --git a/R/progress.R b/R/progress.R
index 5ccc78a..1057cd6 100644
--- a/R/progress.R
+++ b/R/progress.R
@@ -3,13 +3,13 @@
#' This reference class represents a text progress bar displayed estimated
#' time remaining. When finished, it displays the total duration. The
#' automatic progress bar can be disabled by setting option
-#' \code{dplyr.show_progress} to \code{FALSE}.
+#' `dplyr.show_progress` to `FALSE`.
#'
-#' @param n Total number of
-#' @param min_time Progress bar will wait until at least \code{min_time}
+#' @param n Total number of items
+#' @param min_time Progress bar will wait until at least `min_time`
#' seconds have elapsed before displaying any results.
-#' @return A ref class with methods \code{tick()}, \code{print()},
-#' \code{pause()}, and \code{stop()}.
+#' @return A ref class with methods `tick()`, `print()`,
+#' `pause()`, and `stop()`.
#' @keywords internal
#' @export
#' @examples
@@ -77,7 +77,7 @@ Progress <- R6::R6Class("Progress",
"Process one element"
if (self$stopped) return(self)
- if (self$i == self$n) stop("No more ticks")
+ if (self$i == self$n) abort("No more ticks")
self$i <- self$i + 1
self
},
@@ -91,9 +91,9 @@ Progress <- R6::R6Class("Progress",
},
print = function(...) {
- if(!isTRUE(getOption("dplyr.show_progress")) || # user sepecifies no progress
- !interactive() || # not an interactive session
- !is.null(getOption("knitr.in.progress"))) { # dplyr used within knitr document
+ if (!isTRUE(getOption("dplyr.show_progress")) || # user specifies no progress
+ !interactive() || # not an interactive session
+ !is.null(getOption("knitr.in.progress"))) { # dplyr used within knitr document
return(invisible(self))
}
diff --git a/R/pull.R b/R/pull.R
new file mode 100644
index 0000000..271c01a
--- /dev/null
+++ b/R/pull.R
@@ -0,0 +1,52 @@
+#' Pull out a single variable
+#'
+#' This works like `[[` for local data frames, and automatically collects
+#' before indexing for remote data tables.
+#'
+#' @param .data A table of data
+#' @inheritParams select_var
+#' @export
+#' @examples
+#' mtcars %>% pull(-1)
+#' mtcars %>% pull(1)
+#' mtcars %>% pull(cyl)
+#'
+#' # Also works for remote sources
+#' if (requireNamespace("dbplyr", quietly = TRUE)) {
+#' df <- dbplyr::memdb_frame(x = 1:10, y = 10:1, .name = "pull-ex")
+#' df %>%
+#' mutate(z = x * y) %>%
+#' pull()
+#' }
+#'
+pull <- function(.data, var = -1) {
+ UseMethod("pull")
+}
+#' @export
+pull.data.frame <- function(.data, var = -1) {
+ var <- select_var(names(.data), !! enquo(var))
+ .data[[var]]
+}
+
+# FIXME: remove this once dbplyr uses select_var()
+find_var <- function(expr, vars) {
+ var_env <- set_names(as.list(seq_along(vars)), vars)
+ var <- eval_tidy(expr, var_env)
+
+ if (!is.numeric(var) || length(var) != 1) {
+ bad_args("var", "must evaluate to a single number")
+ }
+
+ var <- as.integer(var)
+ n <- length(vars)
+
+ if (is.na(var) || abs(var) > n || var == 0L) {
+ bad_args("var", "must be a value between {-n} and {n} (excluding zero), not {var}")
+ }
+
+ if (var < 0) {
+ var <- var + n + 1
+ }
+
+ vars[[var]]
+}
diff --git a/R/query.r b/R/query.r
deleted file mode 100644
index 000fee8..0000000
--- a/R/query.r
+++ /dev/null
@@ -1,75 +0,0 @@
-#' Create a mutable query object.
-#'
-#' A query object is mutable wrapper around a \code{DBIResult} that caches
-#' expensive operations, and insulates the rest of dplyr from the vagaries of
-#' DBI and the individual database implementation.
-#'
-#' @keywords internal
-#' @param con a \code{DBOConnection}
-#' @param sql a string containing an sql query.
-#' @export
-query <- function(con, sql, .vars) UseMethod("query")
-
-#' @export
-query.DBIConnection <- function(con, sql, .vars) {
- assert_that(is.string(sql))
-
- Query$new(con, sql(sql), .vars)
-}
-
-Query <- R6::R6Class("Query",
- private = list(
- .nrow = NULL,
- .vars = NULL
- ),
- public = list(
- con = NULL,
- sql = NULL,
-
- initialize = function(con, sql, vars) {
- self$con <- con
- self$sql <- sql
- private$.vars <- vars
- },
-
- print = function(...) {
- cat("<Query> ", self$sql, "\n", sep = "")
- print(self$con)
- },
-
- fetch = function(n = -1L) {
- res <- dbSendQuery(self$con, self$sql)
- on.exit(dbClearResult(res))
-
- out <- fetch(res, n)
- res_warn_incomplete(res)
- out
- },
-
- fetch_paged = function(chunk_size = 1e4, callback) {
- qry <- dbSendQuery(self$con, self$sql)
- on.exit(dbClearResult(qry))
-
- while (!dbHasCompleted(qry)) {
- chunk <- fetch(qry, chunk_size)
- callback(chunk)
- }
-
- invisible(TRUE)
- },
-
- vars = function() {
- private$.vars
- },
-
- nrow = function() {
- if (!is.null(private$.nrow)) return(private$.nrow)
- private$.nrow <- db_query_rows(self$con, self$sql)
- private$.nrow
- },
-
- ncol = function() {
- length(self$vars())
- }
- )
-)
diff --git a/R/rank.R b/R/rank.R
index ade0ec1..d36a4fb 100644
--- a/R/rank.R
+++ b/R/rank.R
@@ -1,29 +1,26 @@
#' Windowed rank functions.
#'
-#' Six variations on ranking functions, mimicing the ranking functions
+#' Six variations on ranking functions, mimicking the ranking functions
#' described in SQL2003. They are currently implemented using the built in
-#' \code{rank} function, and are provided mainly as a convenience when
+#' `rank` function, and are provided mainly as a convenience when
#' converting between R and SQL. All ranking functions map smallest inputs
-#' to smallest outputs. Use \code{\link{desc}} to reverse the direction..
+#' to smallest outputs. Use [desc()] to reverse the direction.
#'
-#' \itemize{
-#' \item \code{row_number}: equivalent to \code{rank(ties.method = "first")}
+#' * `row_number()`: equivalent to `rank(ties.method = "first")`
#'
-#' \item \code{min_rank}: equivalent to \code{rank(ties.method = "min")}
+#' * `min_rank()`: equivalent to `rank(ties.method = "min")`
#'
-#' \item \code{dense_rank}: like \code{min_rank}, but with no gaps between
+#' * `dense_rank()`: like `min_rank()`, but with no gaps between
#' ranks
#'
-#' \item \code{percent_rank}: a number between 0 and 1 computed by
-#' rescaling \code{min_rank} to [0, 1]
+#' * `percent_rank()`: a number between 0 and 1 computed by
+#' rescaling `min_rank` to `[0, 1]`
#'
-#' \item \code{cume_dist}: a cumulative distribution function. Proportion
+#' * `cume_dist()`: a cumulative distribution function. Proportion
#' of all values less than or equal to the current rank.
#'
-#' \item \code{ntile}: a rough rank, which breaks the input vector into
-#' \code{n} buckets.
-#'
-#' }
+#' * `ntile()`: a rough rank, which breaks the input vector into
+#' `n` buckets.
#'
#' @name ranking
#' @param x a vector of values to rank. Missing values are left as is.
@@ -39,6 +36,11 @@
#'
#' ntile(x, 2)
#' ntile(runif(100), 10)
+#'
+#' # row_number can be used with single table verbs without specifying x
+#' # (for data frames and databases that support windowing)
+#' mutate(mtcars, row_number() == 1L)
+#' mtcars %>% filter(between(row_number(), 1, 10))
NULL
#' @export
@@ -51,7 +53,13 @@ row_number <- function(x) rank(x, ties.method = "first", na.last = "keep")
#' @export
#' @rdname ranking
ntile <- function(x, n) {
- floor((n * (row_number(x) - 1) / length(x)) + 1)
+ len <- sum(!is.na(x))
+
+ if (len == 0L) {
+ rep(NA_integer_, length(x))
+ } else {
+ as.integer(floor(n * (row_number(x) - 1)/len + 1))
+ }
}
#' @export
@@ -76,4 +84,3 @@ percent_rank <- function(x) {
cume_dist <- function(x) {
rank(x, ties.method = "max", na.last = "keep") / sum(!is.na(x))
}
-
diff --git a/R/rbind.R b/R/rbind.R
new file mode 100644
index 0000000..f34bc9d
--- /dev/null
+++ b/R/rbind.R
@@ -0,0 +1,71 @@
+list_or_dots <- function(...) {
+ dots <- dots_list(...)
+ if (!length(dots)) {
+ return(dots)
+ }
+
+ # Old versions specified that first argument could be a list of
+ # dataframeable objects
+ if (is_list(dots[[1]])) {
+ dots[[1]] <- map_if(dots[[1]], is_dataframe_like, as_tibble)
+ }
+
+ # Need to ensure that each component is a data frame or a vector
+ # wrapped in a list:
+ dots <- map_if(dots, is_dataframe_like, function(x) list(as_tibble(x)))
+ dots <- map_if(dots, is_atomic, list)
+ dots <- map_if(dots, is.data.frame, list)
+
+ unlist(dots, recursive = FALSE)
+}
+
+is_dataframe_like <- function(x) {
+ if (is_null(x))
+ return(FALSE)
+
+ # data frames are not data lists
+ if (is.data.frame(x))
+ return(FALSE)
+
+ # Must be a list
+ if (!is_list(x))
+ return(FALSE)
+
+ # 0 length named list (#1515)
+ if (!is_null(names(x)) && length(x) == 0)
+ return(TRUE)
+
+ # With names
+ if (!is_named(x))
+ return(FALSE)
+
+ # Where each element is an 1d vector or list
+ if (!every(x, is_1d))
+ return(FALSE)
+
+ # All of which have the same length
+ n <- map_int(x, length)
+ if (any(n != n[1]))
+ return(FALSE)
+
+ TRUE
+}
+
+
+# Deprecated functions ----------------------------------------------------
+
+#' @export
+#' @rdname bind
+#' @usage NULL
+rbind_list <- function(...) {
+ .Deprecated("bind_rows()")
+ tbl_df(bind_rows_(list_or_dots(...), id = NULL))
+}
+
+#' @export
+#' @rdname bind
+#' @usage NULL
+rbind_all <- function(x, id = NULL) {
+ .Deprecated("bind_rows()")
+ bind_rows_(x, id = id)
+}
diff --git a/R/recode.R b/R/recode.R
index 312c0cc..f43b614 100644
--- a/R/recode.R
+++ b/R/recode.R
@@ -1,32 +1,44 @@
#' Recode values
#'
-#' This is a vectorised version of \code{\link{switch}()}: you can replace
+#' This is a vectorised version of [switch()]: you can replace
#' numeric values based on their position, and character values by their
#' name. This is an S3 generic: dplyr provides methods for numeric, character,
-#' and factors. For logical vectors, use \code{\link{if_else}}
+#' and factors. For logical vectors, use [if_else()]. For more complicated
+#' criteria, use [case_when()].
+#'
+#' You can use `recode()` directly with factors; it will preserve the existing
+#' order of levels while changing the values. Alternatively, you can
+#' use `recode_factor()`, which will change the order of levels to match
+#' the order of replacements. See the [forcats](http://forcats.tidyverse.org/)
+#' package for more tools for working with factors and their levels.
#'
#' @param .x A vector to modify
-#' @param ... Replacments. These should be named for character and factor
-#' \code{.x}, and can be named for numeric \code{.x}.
+#' @param ... Replacements. These should be named for character and factor
+#' `.x`, and can be named for numeric `.x`. The argument names should be the
+#' current values to be replaced, and the argument values should be the new
+#' (replacement) values.
#'
#' All replacements must be the same type, and must have either
#' length one or the same length as x.
+#'
+#' These dots are evaluated with [explicit splicing][rlang::dots_list].
#' @param .default If supplied, all values not otherwise matched will
#' be given this value. If not supplied and if the replacements are
-#' the same type as the original values in \code{.x}, unmatched
+#' the same type as the original values in `.x`, unmatched
#' values are not changed. If not supplied and if the replacements
-#' are not compatible, unmatched values are replaced with \code{NA}.
-#' \code{.default} must be either length 1 or the same length as
-#' \code{.x}.
-#' @param .missing If supplied, any missing values in \code{.x} will be
+#' are not compatible, unmatched values are replaced with `NA`.
+#'
+#' `.default` must be either length 1 or the same length as
+#' `.x`.
+#' @param .missing If supplied, any missing values in `.x` will be
#' replaced by this value. Must be either length 1 or the same length as
-#' \code{.x}.
-#' @param .ordered If \code{TRUE}, \code{recode_factor()} creates an
+#' `.x`.
+#' @param .ordered If `TRUE`, `recode_factor()` creates an
#' ordered factor.
-#' @return A vector the same length as \code{.x}, and the same type as
-#' the first of \code{...}, \code{.default}, or \code{.missing}.
-#' \code{recode_factor()} returns a factor whose levels are in the
-#' same order as in \code{...}.
+#' @return A vector the same length as `.x`, and the same type as
+#' the first of `...`, `.default`, or `.missing`.
+#' `recode_factor()` returns a factor whose levels are in the
+#' same order as in `...`.
#' @export
#' @examples
#' # Recode values with named arguments
@@ -69,20 +81,19 @@ recode <- function(.x, ..., .default = NULL, .missing = NULL) {
#' @export
recode.numeric <- function(.x, ..., .default = NULL, .missing = NULL) {
- values <- list(...)
+ values <- dots_list(...)
- nms <- has_names(values)
+ nms <- have_name(values)
if (all(nms)) {
vals <- as.double(names(values))
} else if (all(!nms)) {
vals <- seq_along(values)
} else {
- stop("Either all values must be named, or none must be named.",
- call. = FALSE)
+ abort("Either all values must be named, or none must be named.")
}
n <- length(.x)
- template <- find_template(..., .default, .missing)
+ template <- find_template(values, .default, .missing)
out <- template[rep(NA_integer_, n)]
replaced <- rep(FALSE, n)
@@ -99,13 +110,14 @@ recode.numeric <- function(.x, ..., .default = NULL, .missing = NULL) {
#' @export
recode.character <- function(.x, ..., .default = NULL, .missing = NULL) {
- values <- list(...)
- if (!all(has_names(values))) {
- stop("All replacements must be named", call. = FALSE)
+ values <- dots_list(...)
+ if (!all(have_name(values))) {
+ bad <- which(!have_name(values)) + 1
+ bad_pos_args(bad, "must be named, not unnamed")
}
n <- length(.x)
- template <- find_template(..., .default, .missing)
+ template <- find_template(values, .default, .missing)
out <- template[rep(NA_integer_, n)]
replaced <- rep(FALSE, n)
@@ -122,38 +134,50 @@ recode.character <- function(.x, ..., .default = NULL, .missing = NULL) {
#' @export
recode.factor <- function(.x, ..., .default = NULL, .missing = NULL) {
- values <- list(...)
+ values <- dots_list(...)
if (length(values) == 0) {
- stop("No replacements provided", call. = FALSE)
+ abort("No replacements provided")
}
- if (!all(has_names(values))) {
- stop("All replacements must be named", call. = FALSE)
+ if (!all(have_name(values))) {
+ bad <- which(!have_name(values)) + 1
+ bad_pos_args(bad, "must be named, not unnamed")
}
if (!is.null(.missing)) {
- stop("`missing` is not supported for factors", call. = FALSE)
+ bad_args(".missing", "is not supported for factors")
}
- out <- rep(NA_character_, length(levels(.x)))
- replaced <- rep(FALSE, length(levels(.x)))
+ n <- length(levels(.x))
+ template <- find_template(values, .default, .missing)
+ out <- template[rep(NA_integer_, n)]
+ replaced <- rep(FALSE, n)
for (nm in names(values)) {
- out <- replace_with(out, levels(.x) == nm, values[[nm]], paste0("`", nm, "`"))
+ out <- replace_with(
+ out,
+ levels(.x) == nm,
+ values[[nm]],
+ paste0("`", nm, "`")
+ )
replaced[levels(.x) == nm] <- TRUE
}
-
.default <- validate_recode_default(.default, .x, out, replaced)
out <- replace_with(out, !replaced, .default, "`.default`")
- levels(.x) <- out
- .x
+ if (is.character(out)) {
+ levels(.x) <- out
+ .x
+ } else {
+ out[as.integer(.x)]
+ }
+
}
-find_template <- function(...) {
- x <- compact(list(...))
+find_template <- function(values, .default = NULL, .missing = NULL) {
+ x <- compact(c(values, .default, .missing))
if (length(x) == 0) {
- stop("No replacements provided", call. = FALSE)
+ abort("No replacements provided")
}
x[[1]]
@@ -163,9 +187,11 @@ validate_recode_default <- function(default, x, out, replaced) {
default <- recode_default(x, default, out)
if (is.null(default) && sum(replaced & !is.na(x)) < length(out[!is.na(x)])) {
- warning("Unreplaced values treated as NA as .x is not compatible. ",
+ warning(
+ "Unreplaced values treated as NA as .x is not compatible. ",
"Please specify replacements exhaustively or supply .default",
- call. = FALSE)
+ call. = FALSE
+ )
}
default
@@ -185,8 +211,12 @@ recode_default.default <- function(x, default, out) {
}
recode_default.factor <- function(x, default, out) {
- if (is.null(default) && is.factor(x)) {
- levels(x)
+ if (is.null(default)) {
+ if ((is.character(out) || is.factor(out)) && is.factor(x)) {
+ levels(x)
+ } else {
+ out[NA_integer_]
+ }
} else {
default
}
@@ -194,8 +224,8 @@ recode_default.factor <- function(x, default, out) {
#' @rdname recode
#' @export
-recode_factor <- function (.x, ..., .default = NULL, .missing = NULL,
- .ordered = FALSE) {
+recode_factor <- function(.x, ..., .default = NULL, .missing = NULL,
+ .ordered = FALSE) {
recoded <- recode(.x, ..., .default = .default, .missing = .missing)
all_levels <- unique(c(..., recode_default(.x, .default, recoded), .missing))
diff --git a/R/reexport-rlang.R b/R/reexport-rlang.R
new file mode 100644
index 0000000..641e951
--- /dev/null
+++ b/R/reexport-rlang.R
@@ -0,0 +1,11 @@
+#' @export
+rlang::quo
+
+#' @export
+rlang::quos
+
+#' @export
+rlang::enquo
+
+#' @export
+rlang::quo_name
diff --git a/R/tibble-reexport.r b/R/reexport-tibble.r
similarity index 85%
rename from R/tibble-reexport.r
rename to R/reexport-tibble.r
index 64d2e95..3f35166 100644
--- a/R/tibble-reexport.r
+++ b/R/reexport-tibble.r
@@ -42,12 +42,24 @@ tibble::glimpse
#' @export
tibble::frame_data
+#' @importFrom tibble tribble
+#' @export
+tibble::tribble
+
#' @importFrom tibble tibble
#' @export
tibble::tibble
+#' @importFrom tibble as_tibble
+#' @export
+tibble::as_tibble
+
# utils -------------------------------------------------------------------
#' @importFrom tibble trunc_mat
#' @export
tibble::trunc_mat
+
+#' @importFrom tibble tbl_sum
+#' @export
+tibble::tbl_sum
diff --git a/R/rowwise.r b/R/rowwise.r
index 9254940..5a4b0e9 100644
--- a/R/rowwise.r
+++ b/R/rowwise.r
@@ -1,14 +1,14 @@
#' Group input by rows
#'
-#' \code{rowwise} is used for the results of \code{\link{do}} when you
+#' `rowwise()` is used for the results of [do()] when you
#' create list-variables. It is also useful to support arbitrary
#' complex operations that need to be applied to each row.
#'
-#' Currently \code{rowwise} grouping only works with data frames. Its
+#' Currently, rowwise grouping only works with data frames. Its
#' main impact is to allow you to work with list-variables in
-#' \code{\link{summarise}} and \code{\link{mutate}} without having to
-#' use \code{[[1]]}. This makes \code{summarise()} on a rowwise tbl
-#' effectively equivalent to plyr's \code{ldply}.
+#' [summarise()] and [mutate()] without having to
+#' use \code{[[1]]}. This makes `summarise()` on a rowwise tbl
+#' effectively equivalent to [plyr::ldply()].
#'
#' @param data Input data frame.
#' @export
@@ -23,6 +23,8 @@ rowwise <- function(data) {
structure(data, class = c("rowwise_df", "tbl_df", "tbl", "data.frame"))
}
+setOldClass(c("rowwise_df", "tbl_df", "tbl", "data.frame"))
+
#' @export
print.rowwise_df <- function(x, ..., n = NULL, width = NULL) {
cat("Source: local data frame ", dim_desc(x), "\n", sep = "")
@@ -34,7 +36,7 @@ print.rowwise_df <- function(x, ..., n = NULL, width = NULL) {
#' @export
ungroup.rowwise_df <- function(x, ...) {
- class(x) <- c( "tbl_df", "data.frame")
+ class(x) <- c("tbl_df", "tbl", "data.frame")
x
}
#' @export
@@ -54,33 +56,40 @@ n_groups.rowwise_df <- function(x) {
}
#' @export
-group_by_.rowwise_df <- function(.data, ..., .dots, add = FALSE) {
- warning("Grouping rowwise data frame strips rowwise nature", call. = FALSE)
+group_by.rowwise_df <- function(.data, ..., add = FALSE) {
+ warn("Grouping rowwise data frame strips rowwise nature")
.data <- ungroup(.data)
- groups <- group_by_prepare(.data, ..., .dots = .dots, add = add)
- grouped_df(groups$data, groups$groups)
+ groups <- group_by_prepare(.data, ..., add = add)
+ grouped_df(groups$data, groups$group_names)
+}
+#' @export
+group_by_.rowwise_df <- function(.data, ..., .dots = list(), add = FALSE) {
+ dots <- compat_lazy_dots(.dots, caller_env(), ...)
+ group_by(.data, !!! dots, add = add)
}
# Do ---------------------------------------------------------------------------
#' @export
-do_.rowwise_df <- function(.data, ..., .dots) {
+do.rowwise_df <- function(.data, ...) {
# Create ungroup version of data frame suitable for subsetting
group_data <- ungroup(.data)
+ index <- attr(.data, "indices")
- args <- lazyeval::all_dots(.dots, ...)
+ args <- quos(...)
named <- named_args(args)
- env <- new.env(parent = lazyeval::common_env(args))
- index <- attr(.data, "indices")
# Create new environment, inheriting from parent, with an active binding
# for . that resolves to the current subset. `_i` is found in environment
# of this function because of usual scoping rules.
- makeActiveBinding(".", function() {
- lapply(group_data[`_i`, , drop = FALSE], "[[", 1)
- }, env)
+ env <- child_env(NULL)
+ current_row <- function() lapply(group_data[`_i`, , drop = FALSE], "[[", 1)
+ env_bind_fns(.env = env, . = current_row, .data = current_row)
+
+ overscope <- new_overscope(env)
+ on.exit(overscope_clean(overscope))
n <- nrow(.data)
m <- length(args)
@@ -91,7 +100,7 @@ do_.rowwise_df <- function(.data, ..., .dots) {
for (`_i` in seq_len(n)) {
for (j in seq_len(m)) {
- out[[j]][`_i`] <- list(eval(args[[j]]$expr, envir = env))
+ out[[j]][`_i`] <- list(overscope_eval_next(overscope, args[[j]]))
p$tick()$print()
}
}
@@ -102,3 +111,8 @@ do_.rowwise_df <- function(.data, ..., .dots) {
label_output_list(NULL, out, groups(.data))
}
}
+#' @export
+do_.rowwise_df <- function(.data, ..., .dots = list()) {
+ dots <- compat_lazy_dots(.dots, caller_env(), ...)
+ do(.data, !!! dots)
+}
diff --git a/R/sample.R b/R/sample.R
index 9b730f6..dda5bff 100644
--- a/R/sample.R
+++ b/R/sample.R
@@ -1,20 +1,25 @@
-#' Sample n rows from a table.
+#' Sample n rows from a table
#'
-#' This is a wrapper around \code{\link{sample.int}} to make it easy to
+#' This is a wrapper around [sample.int()] to make it easy to
#' select random rows from a table. It currently only works for local
#' tbls.
#'
#' @param tbl tbl of data.
-#' @param size For \code{sample_n}, the number of rows to select.
-#' For \code{sample_frac}, the fraction of rows to select.
-#' If \code{tbl} is grouped, \code{size} applies to each group.
+#' @param size For `sample_n()`, the number of rows to select.
+#' For `sample_frac()`, the fraction of rows to select.
+#' If `tbl` is grouped, `size` applies to each group.
#' @param replace Sample with or without replacement?
-#' @param weight Sampling weights. This expression is evaluated in the
-#' context of the data frame. It must return a vector of non-negative
-#' numbers the same length as the input. Weights are automatically
-#' standardised to sum to 1.
-#' @param .env Environment in which to look for non-data names used in
-#' \code{weight}. Non-default settings for experts only.
+#' @param weight Sampling weights. This must evaluate to a vector of
+#' non-negative numbers the same length as the input. Weights are
+#' automatically standardised to sum to 1.
+#'
+#' This argument is automatically [quoted][rlang::quo] and later
+#' [evaluated][rlang::eval_tidy] in the context of the data
+#' frame. It supports [unquoting][rlang::quasiquotation]. See
+#' `vignette("programming")` for an introduction to these concepts.
+#' @param .env This variable is deprecated and no longer has any
+#' effect. To evaluate `weight` in a particular context, you can
+#' now unquote a [quosure][rlang::quosure].
#' @name sample
#' @examples
#' by_cyl <- mtcars %>% group_by(cyl)
@@ -42,15 +47,13 @@ NULL
#' @rdname sample
#' @export
-sample_n <- function(tbl, size, replace = FALSE, weight = NULL,
- .env = parent.frame()) {
+sample_n <- function(tbl, size, replace = FALSE, weight = NULL, .env = NULL) {
UseMethod("sample_n")
}
#' @rdname sample
#' @export
-sample_frac <- function(tbl, size = 1, replace = FALSE, weight = NULL,
- .env = parent.frame()) {
+sample_frac <- function(tbl, size = 1, replace = FALSE, weight = NULL, .env = NULL) {
UseMethod("sample_frac")
}
@@ -65,16 +68,14 @@ sample_frac <- function(tbl, size = 1, replace = FALSE, weight = NULL,
sample_n.default <- function(tbl, size, replace = FALSE, weight = NULL,
.env = parent.frame()) {
- stop("Don't know how to sample from objects of class ", class(tbl)[1],
- call. = FALSE)
+ bad_args("tbl", "must be a data frame, not {fmt_classes(tbl)}")
}
#' @export
sample_frac.default <- function(tbl, size = 1, replace = FALSE, weight = NULL,
- .env = parent.frame()) {
+ .env = parent.frame()) {
- stop("Don't know how to sample from objects of class ", class(tbl)[1],
- call. = FALSE)
+ bad_args("tbl", "must be a data frame, not {fmt_classes(tbl)}")
}
# Helper functions -------------------------------------------------------------
@@ -83,13 +84,17 @@ check_weight <- function(x, n) {
if (is.null(x)) return()
if (!is.numeric(x)) {
- stop("Weights must be numeric", call. = FALSE)
+ bad_args("weight", "must be a numeric, not {type_of(x)}")
}
if (any(x < 0)) {
- stop("Weights must all be greater than 0", call. = FALSE)
+ bad_args("weight", "must be a vector with all values nonnegative, ",
+ "not {x[x < 0][[1]]}"
+ )
}
if (length(x) != n) {
- stop("Weights must be same length as data (", n, ")", call. = FALSE)
+ bad_args("weight", "must be a length {n} (same as data), ",
+ "not {length(x)}"
+ )
}
x / sum(x)
@@ -98,6 +103,15 @@ check_weight <- function(x, n) {
check_size <- function(size, n, replace = FALSE) {
if (size <= n || replace) return()
- stop("Sample size (", size, ") greater than population size (", n, ").",
- " Do you want replace = TRUE?", call. = FALSE)
+ bad_args("size", "must be less or equal than {n} (size of data), ",
+ "set `replace` = TRUE to use sampling with replacement"
+ )
+}
+
+check_frac <- function(size, replace = FALSE) {
+ if (size <= 1 || replace) return()
+
+ bad_args("size", "of sampled fraction must be less or equal to one, ",
+ "set `replace` = TRUE to use sampling with replacement"
+ )
}
diff --git a/R/select-utils.R b/R/select-utils.R
index 3f84566..077b8bb 100644
--- a/R/select-utils.R
+++ b/R/select-utils.R
@@ -1,24 +1,22 @@
#' Select helpers
#'
#' These functions allow you to select variables based on their names.
-#' \itemize{
-#' \item \code{starts_with()}: starts with a prefix
-#' \item \code{ends_with()}: ends with a prefix
-#' \item \code{contains()}: contains a literal string
-#' \item \code{matches()}: matches a regular expression
-#' \item \code{num_range()}: a numerical range like x01, x02, x03.
-#' \item \code{one_of()}: variables in character vector.
-#' \item \code{everything()}: all variables.
-#' }
+#' * `starts_with()`: starts with a prefix
+#' * `ends_with()`: ends with a prefix
+#' * `contains()`: contains a literal string
+#' * `matches()`: matches a regular expression
+#' * `num_range()`: a numerical range like x01, x02, x03.
+#' * `one_of()`: variables in character vector.
+#' * `everything()`: all variables.
#'
#' @param match A string.
-#' @param ignore.case If \code{TRUE}, the default, ignores case when matching
+#' @param ignore.case If `TRUE`, the default, ignores case when matching
#' names.
#' @param vars A character vector of variable names. When called from inside
-#' \code{\link{select}()} these are automatically set to the names of the
+#' [select()] these are automatically set to the names of the
#' table.
#' @name select_helpers
-#' @return An integer vector given the position of the matched variables.
+#' @return An integer vector giving the position of the matched variables.
#' @examples
#' iris <- tbl_df(iris) # so it prints a little nicer
#' select(iris, starts_with("Petal"))
@@ -31,24 +29,27 @@
#' select(iris, one_of(vars))
NULL
-cur_vars_env <- new.env()
+cur_vars_env <- child_env(NULL)
set_current_vars <- function(x) {
- stopifnot(is.character(x))
+ stopifnot(is_character(x) || is_null(x))
+
+ old <- cur_vars_env$selected
cur_vars_env$selected <- x
-}
-reset_current_vars <- function() {
- set_current_vars(character())
+
+ invisible(old)
}
#' @export
#' @rdname select_helpers
-current_vars <- function() cur_vars_env$selected
+current_vars <- function() {
+ cur_vars_env$selected %||% abort("Variable context not set")
+}
#' @export
#' @rdname select_helpers
starts_with <- function(match, ignore.case = TRUE, vars = current_vars()) {
- stopifnot(is.string(match), !is.na(match), nchar(match) > 0)
+ stopifnot(is_string(match), !is.na(match), nchar(match) > 0)
if (ignore.case) match <- tolower(match)
n <- nchar(match)
@@ -60,7 +61,7 @@ starts_with <- function(match, ignore.case = TRUE, vars = current_vars()) {
#' @export
#' @rdname select_helpers
ends_with <- function(match, ignore.case = TRUE, vars = current_vars()) {
- stopifnot(is.string(match), !is.na(match), nchar(match) > 0)
+ stopifnot(is_string(match), !is.na(match), nchar(match) > 0)
if (ignore.case) match <- tolower(match)
n <- nchar(match)
@@ -74,7 +75,7 @@ ends_with <- function(match, ignore.case = TRUE, vars = current_vars()) {
#' @export
#' @rdname select_helpers
contains <- function(match, ignore.case = TRUE, vars = current_vars()) {
- stopifnot(is.string(match), nchar(match) > 0)
+ stopifnot(is_string(match), nchar(match) > 0)
if (ignore.case) {
vars <- tolower(vars)
@@ -86,7 +87,7 @@ contains <- function(match, ignore.case = TRUE, vars = current_vars()) {
#' @export
#' @rdname select_helpers
matches <- function(match, ignore.case = TRUE, vars = current_vars()) {
- stopifnot(is.string(match), nchar(match) > 0)
+ stopifnot(is_string(match), nchar(match) > 0)
grep_vars(match, vars, ignore.case = ignore.case)
}
@@ -94,11 +95,11 @@ matches <- function(match, ignore.case = TRUE, vars = current_vars()) {
#' @export
#' @rdname select_helpers
#' @param prefix A prefix that starts the numeric range.
-#' @param range A sequence of integers, like \code{1:5}
+#' @param range A sequence of integers, like `1:5`
#' @param width Optionally, the "width" of the numeric range. For example,
#' a range of 2 gives "01", a range of three "001", etc.
num_range <- function(prefix, range, width = NULL, vars = current_vars()) {
- if (!is.null(width)) {
+ if (!is_null(width)) {
range <- sprintf(paste0("%0", width, "d"), range)
}
match_vars(paste0(prefix, range), vars)
@@ -110,13 +111,13 @@ num_range <- function(prefix, range, width = NULL, vars = current_vars()) {
one_of <- function(..., vars = current_vars()) {
keep <- c(...)
- if (!is.character(keep)) {
- stop("`c(...)` must be a character vector", call. = FALSE)
+ if (!is_character(keep)) {
+ bad("All arguments must be character vectors, not {type_of(keep)}")
}
if (!all(keep %in% vars)) {
bad <- setdiff(keep, vars)
- warning("Unknown variables: ", paste0("`", bad, "`", collapse = ", "))
+ warn(glue("Unknown variables: ", paste0("`", bad, "`", collapse = ", ")))
}
match_vars(keep, vars)
@@ -130,20 +131,13 @@ everything <- function(vars = current_vars()) {
match_vars <- function(needle, haystack) {
x <- match(needle, haystack)
- x <- x[!is.na(x)]
-
- fill_out(x, haystack)
+ x[!is.na(x)]
}
grep_vars <- function(needle, haystack, ...) {
- fill_out(grep(needle, haystack, ...), haystack)
+ grep(needle, haystack, ...)
}
which_vars <- function(needle, haystack) {
- fill_out(which(needle == haystack), haystack)
-}
-
-fill_out <- function(x, haystack) {
- if (length(x) > 0) return(x)
- -seq_along(haystack)
+ which(needle == haystack)
}
diff --git a/R/select-var.R b/R/select-var.R
new file mode 100644
index 0000000..b534184
--- /dev/null
+++ b/R/select-var.R
@@ -0,0 +1,65 @@
+#' Select variable
+#'
+#' This function powers [pull()] and various functions of the tidyr
+#' package. It is similar to [select_vars()] but returns only one
+#' column name and has slightly different semantics: it allows
+#' negative numbers to select columns from the end.
+#'
+#' @inheritParams select_vars
+#' @param var A variable specified as:
+#' * a literal variable name
+#' * a positive integer, giving the position counting from the left
+#' * a negative integer, giving the position counting from the right.
+#'
+#' The default returns the last column (on the assumption that's the
+#' column you've created most recently).
+#'
+#' This argument is taken by expression and supports
+#' [quasiquotation][rlang::quasiquotation] (you can unquote column
+#' names and column positions).
+#' @return The selected column name as an unnamed string.
+#' @seealso [pull()], [select_vars()]
+#' @export
+#' @keywords internal
+#' @examples
+#' # It takes its argument by expression:
+#' select_var(letters, c)
+#'
+#' # Negative numbers select from the end:
+#' select_var(letters, -3)
+#'
+#' # You can unquote variables:
+#' var <- 10
+#' select_var(letters, !! var)
+select_var <- function(vars, var = -1) {
+ var_env <- set_names(as_list(seq_along(vars)), vars)
+ var <- eval_tidy(enquo(var), var_env)
+ n <- length(vars)
+
+ # Let degenerate values like `Inf` fall through to the integerish branch
+ if (is_double(var, 1) && !is.finite(var)) {
+ var <- na_int
+ }
+
+ if (is_string(var)) {
+ pos <- match_var(var, vars)
+ } else if (is_integerish(var, 1)) {
+ if (is_na(var) || abs(var) > n || var == 0L) {
+ abort(glue(
+ "`var` must be a value between {-n} and {n} (excluding zero), not {var}"
+ ))
+ }
+ if (var < 0) {
+ pos <- var + n + 1
+ } else {
+ pos <- var
+ }
+ } else {
+ type <- friendly_type(type_of(var))
+ abort(glue(
+ "`var` must evaluate to a single number or a column name, not {type}"
+ ))
+ }
+
+ vars[[pos]]
+}
diff --git a/R/select-vars.R b/R/select-vars.R
index 5486436..56e6c26 100644
--- a/R/select-vars.R
+++ b/R/select-vars.R
@@ -1,12 +1,31 @@
#' Select variables.
#'
-#' These functions power \code{\link{select}()} and \code{\link{rename}()}.
+#' These functions power [select()] and [rename()].
+#'
+#' For historic reasons, the `vars` and `include` arguments are not
+#' prefixed with `.`. This means that any argument starting with `v`
+#' might partial-match on `vars` if it is not explicitly named. Also
+#' `...` cannot accept arguments named `exclude` or `include`. You can
+#' enquose and splice the dots to work around these limitations (see
+#' examples).
#'
#' @param vars A character vector of existing column names.
-#' @param ...,args Expressions to compute. \code{select_vars} and
-#' \code{rename_vars}
+#' @param ...,args Expressions to compute
+#'
+#' These arguments are automatically [quoted][rlang::quo] and
+#' [evaluated][rlang::eval_tidy] in a context where elements of
+#' `vars` are objects representing their positions within
+#' `vars`. They support [unquoting][rlang::quasiquotation] and
+#' splicing. See `vignette("programming")` for an introduction to
+#' these concepts.
+#'
+#' Note that except for `:`, `-` and `c()`, all complex expressions
+#' are evaluated outside that context. This is to prevent accidental
+#' matching to `vars` elements when you refer to variables from the
+#' calling context.
#' @param include,exclude Character vector of column names to always
#' include/exclude.
+#' @seealso [select_var()]
#' @export
#' @keywords internal
#' @return A named character vector. Values are existing column names,
@@ -39,55 +58,81 @@
#' # Rename variables preserving all existing
#' rename_vars(names(iris), petal_length = Petal.Length)
#'
-#' # Standard evaluation -------------------------------------------------------
-#' # You can use names, calls, formulas (or lists of), or a character vector
-#' select_vars_(names(iris), list(~Petal.Length))
-#' select_vars_(names(iris), list(quote(Petal.Length)))
-#' select_vars_(names(iris), "Petal.Length")
+#' # You can unquote names or formulas (or lists of)
+#' select_vars(names(iris), !!! list(quo(Petal.Length)))
+#' select_vars(names(iris), !! quote(Petal.Length))
+#'
+#' # The .data pronoun is available:
+#' select_vars(names(mtcars), .data$cyl)
+#' select_vars(names(mtcars), .data$mpg : .data$disp)
+#'
+#' # However it isn't available within calls since those are evaluated
+#' # outside of the data context. This would fail if run:
+#' # select_vars(names(mtcars), identical(.data$cyl))
+#'
+#'
+#' # If you're writing a wrapper around select_vars(), pass the dots
+#' # via splicing to avoid matching dotted arguments to select_vars()
+#' # named arguments (`vars`, `include` and `exclude`):
+#' wrapper <- function(...) {
+#' select_vars(names(mtcars), !!! quos(...))
+#' }
+#'
+#' # This won't partial-match on `vars`:
+#' wrapper(var = cyl)
+#'
+#' # This won't match on `include`:
+#' wrapper(include = cyl)
select_vars <- function(vars, ..., include = character(), exclude = character()) {
- args <- lazyeval::lazy_dots(...)
- select_vars_(vars, args, include = include, exclude = exclude)
-}
-
-#' @rdname select_vars
-#' @export
-select_vars_ <- function(vars, args, include = character(), exclude = character()) {
+ quos <- quos(...)
- if (length(args) == 0) {
+ if (is_empty(quos)) {
vars <- setdiff(include, exclude)
- return(setNames(vars, vars))
+ return(set_names(vars, vars))
}
- # Set current_vars so avaialble to select_helpers
- set_current_vars(vars)
- on.exit(reset_current_vars(), add = TRUE)
+ # Set current_vars so it is available to select_helpers
+ old <- set_current_vars(vars)
+ on.exit(set_current_vars(old), add = TRUE)
# Map variable names to their positions: this keeps integer semantics
- args <- lazyeval::as.lazy_dots(args)
- names_list <- setNames(as.list(seq_along(vars)), vars)
-
- ind_list <- lazyeval::lazy_eval(args, names_list)
- names(ind_list) <- names2(args)
-
- is_numeric <- vapply(ind_list, is.numeric, logical(1))
- if (any(!is_numeric)) {
- bad_inputs <- lapply(args[!is_numeric], `[[`, "expr")
- labels <- vapply(bad_inputs, deparse_trunc, character(1))
-
- stop("All select() inputs must resolve to integer column positions.\n",
- "The following do not:\n", paste("* ", labels, collapse = "\n"),
- call. = FALSE)
+ names_list <- set_names(as.list(seq_along(vars)), vars)
+
+ # if the first selector is exclusive (negative), start with all columns
+ first <- f_rhs(quos[[1]])
+ initial_case <- if (is_negated(first)) list(seq_along(vars)) else integer(0)
+
+ # Evaluate symbols in an environment where columns are bound, but
+ # not calls (select helpers are scoped in the calling environment)
+ is_helper <- map_lgl(quos, quo_is_helper)
+ ind_list <- map_if(quos, is_helper, eval_tidy)
+ ind_list <- map_if(ind_list, !is_helper, eval_tidy, data = names_list)
+
+ ind_list <- c(initial_case, ind_list)
+ names(ind_list) <- c(names2(initial_case), names2(quos))
+
+ # Match strings to variable positions
+ ind_list <- map_if(ind_list, is_character, match_var, table = vars)
+
+ is_integerish <- map_lgl(ind_list, is_integerish)
+ if (any(!is_integerish)) {
+ bad <- quos[!is_integerish]
+ first <- ind_list[!is_integerish][[1]]
+ first_type <- friendly_type(type_of(first))
+ bad_calls(bad,
+ "must resolve to integer column positions, not {first_type}"
+ )
}
incl <- combine_vars(vars, ind_list)
# Include/exclude specified variables
- sel <- setNames(vars[incl], names(incl))
+ sel <- set_names(vars[incl], names(incl))
sel <- c(setdiff2(include, sel), sel)
sel <- setdiff2(sel, exclude)
# Ensure all output vars named
- if (length(sel) == 0) {
+ if (is_empty(sel)) {
names(sel) <- sel
} else {
unnamed <- names2(sel) == ""
@@ -97,48 +142,91 @@ select_vars_ <- function(vars, args, include = character(), exclude = character(
sel
}
+quo_is_helper <- function(quo) {
+ expr <- f_rhs(quo)
-setdiff2 <- function(x, y) {
- x[match(x, y, 0L) == 0L]
+ if (!is_lang(expr)) {
+ return(FALSE)
+ }
+
+ if (is_data_pronoun(expr)) {
+ return(FALSE)
+ }
+
+ if (is_lang(expr, c("-", ":", "c"))) {
+ return(FALSE)
+ }
+
+ TRUE
+}
+match_var <- function(chr, table) {
+ pos <- match(chr, table)
+ if (any(are_na(pos))) {
+ chr <- glue::collapse(chr[are_na(pos)], ", ")
+ abort(glue("Strings must match column names. Unknown columns: {chr}"))
+ }
+ pos
}
+#' @rdname se-deprecated
+#' @inheritParams select_vars
#' @export
-#' @rdname select_vars
-rename_vars <- function(vars, ...) {
- rename_vars_(vars, lazyeval::lazy_dots(...))
+select_vars_ <- function(vars, args, include = character(), exclude = character()) {
+ args <- compat_lazy_dots(args, caller_env())
+ select_vars(vars, !!! args, include = include, exclude = exclude)
+}
+
+setdiff2 <- function(x, y) {
+ x[match(x, y, 0L) == 0L]
}
#' @export
#' @rdname select_vars
-rename_vars_ <- function(vars, args) {
- if (any(names2(args) == "")) {
- stop("All arguments to `rename()` must be named.", call. = FALSE)
- }
-
- args <- lazyeval::as.lazy_dots(args)
- is_name <- vapply(args, function(x) is.name(x$expr), logical(1))
- if (!all(is_name)) {
- n <- sum(!is_name)
- bad <- paste0("`", names(args)[!is_name], "`", collapse = ", ")
-
- stop(
- "Arguments to `rename()` must be unquoted variable names.\n",
- sprintf(ngettext(n, "Argument %s is not.", "Arguments %s are not."), bad),
- call. = FALSE
- )
+#' @param strict If `TRUE`, will throw an error if you attempt to rename a
+#' variable that doesn't exist.
+rename_vars <- function(vars, ..., strict = TRUE) {
+ exprs <- exprs(...)
+ if (any(names2(exprs) == "")) {
+ abort("All arguments must be named")
}
- old_vars <- vapply(args, function(x) as.character(x$expr), character(1))
- new_vars <- names(args)
+ old_vars <- map2(exprs, names(exprs), switch_rename)
+ new_vars <- names(exprs)
unknown_vars <- setdiff(old_vars, vars)
- if (length(unknown_vars) > 0) {
- stop("Unknown variables: ", paste0(unknown_vars, collapse = ", "), ".",
- call. = FALSE)
+ if (strict && length(unknown_vars) > 0) {
+ bad_args(unknown_vars, "contains unknown variables")
}
- select <- setNames(vars, vars)
+ select <- set_names(vars, vars)
names(select)[match(old_vars, vars)] <- new_vars
select
}
+#' @export
+#' @rdname se-deprecated
+rename_vars_ <- function(vars, args) {
+ args <- compat_lazy_dots(args, caller_env())
+ rename_vars(vars, !!! args)
+}
+
+# FIXME: that's not a tidy implementation yet because we need to
+# handle non-existing symbols silently when `strict = FALSE`
+switch_rename <- function(expr, name) {
+ switch_type(expr,
+ string = ,
+ symbol =
+ return(as_string(expr)),
+ language =
+ if (is_data_pronoun(expr)) {
+ args <- node_cdr(expr)
+ return(switch_rename(node_cadr(args)))
+ } else {
+ abort("Expressions are currently not supported in `rename()`")
+ }
+ )
+
+ actual_type <- friendly_type(type_of(expr))
+ named_call <- ll(!! name := expr)
+ bad_named_calls(named_call, "must be a symbol or a string, not {actual_type}")
+}
diff --git a/R/sets.r b/R/sets.r
index 7395afe..ccf158d 100644
--- a/R/sets.r
+++ b/R/sets.r
@@ -1,4 +1,4 @@
-#' Set operations.
+#' Set operations
#'
#' These functions override the set functions provided in base to make them
#' generic so that efficient versions for data frames and other tables can be
@@ -47,4 +47,3 @@ union_all.default <- function(x, y, ...) combine(x, y, ...)
setdiff.default <- function(x, y, ...) base::setdiff(x, y, ...)
#' @export
setequal.default <- function(x, y, ...) base::setequal(x, y, ...)
-
diff --git a/R/sql-build.R b/R/sql-build.R
deleted file mode 100644
index 4e42e36..0000000
--- a/R/sql-build.R
+++ /dev/null
@@ -1,205 +0,0 @@
-#' Build and render SQL from a sequence of lazy operations
-#'
-#' \code{sql_build} creates a \code{select_query} S3 object, that is rendered
-#' to a SQL string by \code{sql_render}. The output from \code{sql_build} is
-#' designed to be easy to test, as it's database diagnostic, and has
-#' a hierarchical structure.
-#'
-#' \code{sql_build} is generic over the lazy operations, \link{lazy_ops},
-#' and generates an S3 object that represents the query. \code{sql_render}
-#' takes a query object and then calls a function that is generic
-#' over the database. For example, \code{sql_build.op_mutate} generates
-#' a \code{select_query}, and \code{sql_render.select_query} calls
-#' \code{sql_select}, which has different methods for different databases.
-#' The default methods should generate ANSI 92 SQL where possible, so you
-#' backends only need to override the methods if the backend is not ANSI
-#' compliant.
-#'
-#' @export
-#' @keywords internal
-#' @param op A sequence of lazy operations
-#' @param con A database connection. The default \code{NULL} uses a set of
-#' rules that should be very similar to ANSI 92, and allows for testing
-#' without an active database connection.
-#' @param ... Other arguments passed on to the methods. Not currently used.
-sql_build <- function(op, con, ...) {
- UseMethod("sql_build")
-}
-
-#' @export
-sql_build.tbl_sql <- function(op, con, ...) {
- sql_build(op$ops, op$con, ...)
-}
-
-#' @export
-sql_build.tbl_lazy <- function(op, con = NULL, ...) {
- sql_build(op$ops, con, ...)
-}
-
-# Base ops --------------------------------------------------------
-
-#' @export
-sql_build.op_base_remote <- function(op, con, ...) {
- op$x
-}
-
-#' @export
-sql_build.op_base_local <- function(op, con, ...) {
- ident("df")
-}
-
-# Single table ops --------------------------------------------------------
-
-#' @export
-sql_build.op_select <- function(op, con, ...) {
- vars <- select_vars_(op_vars(op$x), op$dots, include = op_grps(op$x))
- select_query(sql_build(op$x, con), ident(vars))
-}
-
-#' @export
-sql_build.op_rename <- function(op, con, ...) {
- vars <- rename_vars_(op_vars(op$x), op$dots)
- select_query(sql_build(op$x, con), ident(vars))
-}
-
-#' @export
-sql_build.op_arrange <- function(op, con, ...) {
- order_vars <- translate_sql_(op$dots, con, op_vars(op$x))
- group_vars <- c.sql(ident(op_grps(op$x)), con = con)
-
- select_query(sql_build(op$x, con), order_by = order_vars)
-}
-
-#' @export
-sql_build.op_summarise <- function(op, con, ...) {
- select_vars <- translate_sql_(op$dots, con, op_vars(op$x), window = FALSE)
- group_vars <- c.sql(ident(op_grps(op$x)), con = con)
-
- select_query(
- sql_build(op$x, con),
- select = c.sql(group_vars, select_vars, con = con),
- group_by = group_vars
- )
-}
-
-#' @export
-sql_build.op_mutate <- function(op, con, ...) {
- vars <- op_vars(op$x)
-
- new_vars <- translate_sql_(op$dots, con, vars,
- vars_group = op_grps(op),
- vars_order = op_sort(op)
- )
- old_vars <- ident(setdiff(vars, names(new_vars)))
-
- select_query(
- sql_build(op$x, con),
- select = c.sql(old_vars, new_vars, con = con)
- )
-}
-
-#' @export
-sql_build.op_head <- function(op, con, ...) {
- select_query(sql_build(op$x, con), limit = op$args$n)
-}
-
-#' @export
-sql_build.op_group_by <- function(op, con, ...) {
- sql_build(op$x, con, ...)
-}
-
-#' @export
-sql_build.op_ungroup <- function(op, con, ...) {
- sql_build(op$x, con, ...)
-}
-
-#' @export
-sql_build.op_filter <- function(op, con, ...) {
- vars <- op_vars(op$x)
-
- if (!uses_window_fun(op$dots, con)) {
- where_sql <- translate_sql_(op$dots, con, vars = vars)
-
- select_query(
- sql_build(op$x, con),
- where = where_sql
- )
- } else {
- # Do partial evaluation, then extract out window functions
- expr <- partial_eval2(op$dots, vars)
- where <- translate_window_where_all(expr, ls(sql_translate_env(con)$window))
-
- # Convert where$expr back to a lazy dots object, and then
- # create mutate operation
- mutate_dots <- lapply(where$comp, lazyeval::as.lazy)
- mutated <- sql_build(op_single("mutate", op$x, dots = mutate_dots), con)
- where_sql <- translate_sql_(where$expr, con = con, vars = vars)
-
- select_query(mutated, select = ident(vars), where = where_sql)
- }
-
-}
-
-#' @export
-sql_build.op_distinct <- function(op, con, ...) {
- if (length(op$dots) == 0) {
- select_query(
- sql_build(op$x, con),
- distinct = TRUE
- )
- } else {
- if (op$args$.keep_all) {
- stop("Can't calculate distinct only on specified columns with SQL unless .keep_all is FALSE",
- call. = FALSE)
- }
-
- group_vars <- c.sql(ident(names(op$dots)), con = con)
- select_query(
- sql_build(op$x, con),
- select = group_vars,
- group_by = group_vars
- )
- }
-}
-
-# Dual table ops --------------------------------------------------------
-
-#' @export
-sql_build.op_join <- function(op, con, ...) {
- # Ensure tables have unique names
- x_names <- op_vars(op$x)
- y_names <- op_vars(op$y)
- by <- op$args$by
-
- uniques <- unique_names(x_names, y_names, by = by, suffix = op$args$suffix)
-
- if (is.null(uniques)) {
- x <- op$x
- y <- op$y
- } else {
- # TODO: it would be better to construct an explicit FROM statement
- # that used the table names to disambiguate the fields names: this
- # would remove a layer of subqueries and would make sql_join more
- # flexible.
- x <- select_(op$x, .dots = setNames(x_names, uniques$x))
- y <- select_(op$y, .dots = setNames(y_names, uniques$y))
-
- by$x <- unname(uniques$x[by$x])
- by$y <- unname(uniques$y[by$y])
- }
-
- join_query(x, y,
- type = op$args$type,
- by = by
- )
-}
-
-#' @export
-sql_build.op_semi_join <- function(op, con, ...) {
- semi_join_query(op$x, op$y, anti = op$args$anti, by = op$args$by)
-}
-
-#' @export
-sql_build.op_set_op <- function(op, con, ...) {
- set_op_query(op$x, op$y, type = op$args$type)
-}
diff --git a/R/sql-escape.r b/R/sql-escape.r
deleted file mode 100644
index 937e168..0000000
--- a/R/sql-escape.r
+++ /dev/null
@@ -1,248 +0,0 @@
-#' SQL escaping.
-#'
-#' These functions are critical when writing functions that translate R
-#' functions to sql functions. Typically a conversion function should escape
-#' all it's inputs and return an sql object.
-#'
-#' @param ... Character vectors that will be combined into a single SQL
-#' expression. \code{ident} flags its input as a identifier, to ensure that
-#' it gets the correct quoting.
-#' @param x An object to escape. Existing sql vectors will be left as is,
-#' character vectors are escaped with single quotes, numeric vectors have
-#' trailing \code{.0} added if they're whole numbers, identifiers are
-#' escaped with double quotes.
-#' @param parens,collapse Controls behaviour when multiple values are supplied.
-#' \code{parens} should be a logical flag, or if \code{NA}, will wrap in
-#' parens if length > 1.
-#'
-#' Default behaviour: lists are always wrapped in parens and separated by
-#' commas, identifiers are separated by commas and never wrapped,
-#' atomic vectors are separated by spaces and wrapped in parens if needed.
-#' @keywords internal
-#' @export
-#' @examples
-#' # Doubles vs. integers
-#' escape(1:5)
-#' escape(c(1, 5.4))
-#'
-#' # String vs known sql vs. sql identifier
-#' escape("X")
-#' escape(sql("X"))
-#' escape(ident("X"))
-#'
-#' # Escaping is idempotent
-#' escape("X")
-#' escape(escape("X"))
-#' escape(escape(escape("X")))
-sql <- function(...) {
- x <- c(...)
- if (length(x) == 0) {
- structure(character(), class = c("sql", "character"))
- } else {
- stopifnot(is.character(x))
- structure(x, class = c("sql", "character"))
- }
-}
-
-#' @export
-#' @rdname sql
-ident <- function(...) {
- x <- c(...)
- if (length(x) == 0) return(sql())
- stopifnot(is.character(x))
-
- structure(x, class = c("ident", "sql", "character"))
-}
-
-#' @export
-c.sql <- function(..., drop_null = FALSE, con = NULL) {
- input <- list(...)
- if (drop_null) input <- compact(input)
-
- out <- unlist(lapply(input, escape, collapse = NULL, con = con))
- sql(out)
-}
-
-
-#' @export
-unique.sql <- function(x, ...) {
- sql(NextMethod())
-}
-
-
-setOldClass(c("sql", "character"))
-setOldClass(c("ident", "sql", "character"))
-
-#' @rdname sql
-#' @export
-is.sql <- function(x) inherits(x, "sql")
-
-#' @rdname sql
-#' @export
-is.ident <- function(x) inherits(x, "ident")
-
-
-#' @export
-print.sql <- function(x, ...) cat(format(x, ...), sep = "\n")
-#' @export
-format.sql <- function(x, ...) paste0("<SQL> ", x)
-#' @export
-format.ident <- function(x, ...) paste0("<VAR> ", escape(x))
-
-#' @rdname sql
-#' @export
-escape <- function(x, parens = NA, collapse = " ", con = NULL) {
- UseMethod("escape")
-}
-
-#' @export
-escape.ident <- function(x, parens = FALSE, collapse = ", ", con = NULL) {
- y <- sql_escape_ident(con, x)
- sql_vector(names_to_as(y, con), parens, collapse)
-}
-
-#' @export
-escape.logical <- function(x, parens = NA, collapse = ", ", con = NULL) {
- x <- as.character(x)
- x[is.na(x)] <- "NULL"
- sql_vector(x, parens, collapse)
-}
-
-#' @export
-escape.factor <- function(x, parens = NA, collapse = ", ", con = NULL) {
- x <- as.character(x)
- escape.character(x, parens = parens, collapse = collapse, con = con)
-}
-
-#' @export
-escape.Date <- function(x, parens = NA, collapse = ", ", con = NULL) {
- x <- as.character(x)
- escape.character(x, parens = parens, collapse = collapse, con = con)
-}
-
-#' @export
-escape.POSIXt <- function(x, parens = NA, collapse = ", ", con = NULL) {
- x <- strftime(x, "%Y-%m-%dT%H:%M:%OSZ", tz = "UTC")
- escape.character(x, parens = parens, collapse = collapse, con = con)
-}
-
-#' @export
-escape.character <- function(x, parens = NA, collapse = ", ", con = NULL) {
- sql_vector(sql_escape_string(con, x), parens, collapse, con = con)
-}
-
-#' @export
-escape.double <- function(x, parens = NA, collapse = ", ", con = NULL) {
- missing <- is.na(x)
- x <- ifelse(is.wholenumber(x), sprintf("%.1f", x), as.character(x))
- x[missing] <- "NULL"
-
- sql_vector(x, parens, collapse)
-}
-
-#' @export
-escape.integer <- function(x, parens = NA, collapse = ", ", con = NULL) {
- x[is.na(x)] <- "NULL"
- sql_vector(x, parens, collapse)
-}
-
-#' @export
-escape.NULL <- function(x, parens = NA, collapse = " ", con = NULL) {
- sql("NULL")
-}
-
-#' @export
-escape.sql <- function(x, parens = NULL, collapse = NULL, con = NULL) {
- sql_vector(x, isTRUE(parens), collapse, con = con)
-}
-
-#' @export
-escape.list <- function(x, parens = TRUE, collapse = ", ", con = NULL) {
- pieces <- vapply(x, escape, character(1), con = con)
- sql_vector(pieces, parens, collapse)
-}
-
-#' @export
-#' @rdname sql
-sql_vector <- function(x, parens = NA, collapse = " ", con = NULL) {
- if (is.na(parens)) {
- parens <- length(x) > 1L
- }
-
- x <- names_to_as(x, con = con)
- x <- paste(x, collapse = collapse)
- if (parens) x <- paste0("(", x, ")")
- sql(x)
-}
-
-names_to_as <- function(x, con = NULL) {
- names <- names2(x)
- as <- ifelse(names == '', '', paste0(' AS ', sql_escape_ident(con, names)))
-
- paste0(x, as)
-}
-
-
-#' Build a SQL string.
-#'
-#' This is a convenience function that should prevent sql injection attacks
-#' (which in the context of dplyr are most likely to be accidental not
-#' deliberate) by automatically escaping all expressions in the input, while
-#' treating bare strings as sql. This is unlikely to prevent any serious
-#' attack, but should make it unlikely that you produce invalid sql.
-#'
-#' @param ... input to convert to SQL. Use \code{\link{sql}} to preserve
-#' user input as is (dangerous), and \code{\link{ident}} to label user
-#' input as sql identifiers (safe)
-#' @param .env the environment in which to evalute the arguments. Should not
-#' be needed in typical use.
-#' @param con database connection; used to select correct quoting characters.
-#' @export
-#' @examples
-#' build_sql("SELECT * FROM TABLE")
-#' x <- "TABLE"
-#' build_sql("SELECT * FROM ", x)
-#' build_sql("SELECT * FROM ", ident(x))
-#' build_sql("SELECT * FROM ", sql(x))
-#'
-#' # http://xkcd.com/327/
-#' name <- "Robert'); DROP TABLE Students;--"
-#' build_sql("INSERT INTO Students (Name) VALUES (", name, ")")
-build_sql <- function(..., .env = parent.frame(), con = NULL) {
- escape_expr <- function(x) {
- # If it's a string, leave it as is
- if (is.character(x)) return(x)
-
- val <- eval(x, .env)
- # Skip nulls, so you can use if statements like in paste
- if (is.null(val)) return("")
-
- escape(val, con = con)
- }
-
- pieces <- vapply(dots(...), escape_expr, character(1))
- sql(paste0(pieces, collapse = ""))
-}
-
-#' Helper function for quoting sql elements.
-#'
-#' If the quote character is present in the string, it will be doubled.
-#' \code{NA}s will be replaced with NULL.
-#'
-#' @export
-#' @param x Character vector to escape.
-#' @param quote Single quoting character.
-#' @export
-#' @keywords internal
-#' @examples
-#' sql_quote("abc", "'")
-#' sql_quote("I've had a good day", "'")
-#' sql_quote(c("abc", NA), "'")
-sql_quote <- function(x, quote) {
- y <- gsub(quote, paste0(quote, quote), x, fixed = TRUE)
- y <- paste0(quote, y, quote)
- y[is.na(x)] <- "NULL"
- names(y) <- names(x)
-
- y
-}
diff --git a/R/sql-generic.R b/R/sql-generic.R
deleted file mode 100644
index 9f42dd7..0000000
--- a/R/sql-generic.R
+++ /dev/null
@@ -1,184 +0,0 @@
-#' SQL generation.
-#'
-#' These generics are used to run build various SQL queries. A default method
-#' generates ANSI 92 compliant SQL, but variations in SQL across databases means
-#' that it's likely that a backend will require at least a few methods.
-#'
-#' @return An SQL string.
-#' @name backend_sql
-#' @param con A database connection.
-#' @keywords internal
-NULL
-
-#' @rdname backend_sql
-#' @export
-sql_select <- function(con, select, from, where = NULL, group_by = NULL,
- having = NULL, order_by = NULL, limit = NULL,
- distinct = FALSE, ...) {
- UseMethod("sql_select")
-}
-#' @export
-sql_select.default <- function(con, select, from, where = NULL,
- group_by = NULL, having = NULL,
- order_by = NULL,
- limit = NULL,
- distinct = FALSE,
- ...) {
- out <- vector("list", 7)
- names(out) <- c("select", "from", "where", "group_by", "having", "order_by",
- "limit")
-
- assert_that(is.character(select), length(select) > 0L)
- out$select <- build_sql(
- "SELECT ",
- if (distinct) sql("DISTINCT "),
- escape(select, collapse = ", ", con = con)
- )
-
- assert_that(is.character(from), length(from) == 1L)
- out$from <- build_sql("FROM ", from, con = con)
-
- if (length(where) > 0L) {
- assert_that(is.character(where))
-
- where_paren <- escape(where, parens = TRUE, con = con)
- out$where <- build_sql("WHERE ", sql_vector(where_paren, collapse = " AND "))
- }
-
- if (length(group_by) > 0L) {
- assert_that(is.character(group_by))
- out$group_by <- build_sql("GROUP BY ",
- escape(group_by, collapse = ", ", con = con))
- }
-
- if (length(having) > 0L) {
- assert_that(is.character(having))
- out$having <- build_sql("HAVING ",
- escape(having, collapse = ", ", con = con))
- }
-
- if (length(order_by) > 0L) {
- assert_that(is.character(order_by))
- out$order_by <- build_sql("ORDER BY ",
- escape(order_by, collapse = ", ", con = con))
- }
-
- if (!is.null(limit)) {
- assert_that(is.numeric(limit), length(limit) == 1L)
- out$limit <- build_sql("LIMIT ",
- sql(format(trunc(limit), scientific = FALSE)),
- con = con)
- }
-
- escape(unname(compact(out)), collapse = "\n", parens = FALSE, con = con)
-}
-
-#' @export
-#' @rdname backend_sql
-sql_subquery <- function(con, from, name = random_table_name(), ...) {
- UseMethod("sql_subquery")
-}
-#' @export
-sql_subquery.default <- function(con, from, name = unique_name(), ...) {
- if (is.ident(from)) {
- setNames(from, name)
- } else {
- build_sql("(", from, ") ", ident(name %||% random_table_name()), con = con)
- }
-}
-
-#' @rdname backend_sql
-#' @export
-sql_join <- function(con, x, y, type = "inner", by = NULL, ...) {
- UseMethod("sql_join")
-}
-#' @export
-sql_join.default <- function(con, x, y, type = "inner", by = NULL, ...) {
- join <- switch(type,
- left = sql("LEFT"),
- inner = sql("INNER"),
- right = sql("RIGHT"),
- full = sql("FULL"),
- stop("Unknown join type:", type, call. = FALSE)
- )
-
- using <- all(by$x == by$y)
-
- if (using) {
- cond <- build_sql("USING ", lapply(by$x, ident), con = con)
- } else {
- on <- sql_vector(paste0(sql_escape_ident(con, by$x), " = ", sql_escape_ident(con, by$y)),
- collapse = " AND ", parens = TRUE)
- cond <- build_sql("ON ", on, con = con)
- }
-
- build_sql(
- 'SELECT * FROM ',x, "\n\n",
- join, " JOIN\n\n" ,
- y, "\n\n",
- cond,
- con = con
- )
-}
-
-#' @rdname backend_sql
-#' @export
-sql_semi_join <- function(con, x, y, anti = FALSE, by = NULL, ...) {
- UseMethod("sql_semi_join")
-}
-#' @export
-sql_semi_join.default <- function(con, x, y, anti = FALSE, by = NULL, ...) {
- # X and Y are subqueries named _LEFT and _RIGHT
- left <- escape(ident("_LEFT"), con = con)
- right <- escape(ident("_RIGHT"), con = con)
- on <- sql_vector(
- paste0(
- left, ".", sql_escape_ident(con, by$x), " = ",
- right, ".", sql_escape_ident(con, by$y)
- ),
- collapse = " AND ",
- parens = TRUE,
- con = con
- )
-
- build_sql(
- 'SELECT * FROM ', x, '\n\n',
- 'WHERE ', if (anti) sql('NOT '), 'EXISTS (\n',
- ' SELECT 1 FROM ', y, '\n',
- ' WHERE ', on, '\n',
- ')',
- con = con
- )
-}
-
-#' @rdname backend_sql
-#' @export
-sql_set_op <- function(con, x, y, method) {
- UseMethod("sql_set_op")
-}
-#' @export
-sql_set_op.default <- function(con, x, y, method) {
- build_sql(
- x,
- "\n", sql(method), "\n",
- y
- )
-}
-
-#' @rdname backend_sql
-#' @export
-sql_escape_string <- function(con, x) UseMethod("sql_escape_string")
-
-#' @export
-sql_escape_string.default <- function(con, x) {
- sql_quote(x, "'")
-}
-
-#' @rdname backend_sql
-#' @export
-sql_escape_ident <- function(con, x) UseMethod("sql_escape_ident")
-
-#' @export
-sql_escape_ident.default <- function(con, x) {
- sql_quote(x, '"')
-}
diff --git a/R/sql-query.R b/R/sql-query.R
deleted file mode 100644
index 8b6ac0e..0000000
--- a/R/sql-query.R
+++ /dev/null
@@ -1,125 +0,0 @@
-
-
-# select_query ------------------------------------------------------------
-
-#' @export
-#' @rdname sql_build
-select_query <- function(from,
- select = sql("*"),
- where = character(),
- group_by = character(),
- having = character(),
- order_by = character(),
- limit = NULL,
- distinct = FALSE) {
-
- stopifnot(is.character(select))
- stopifnot(is.character(where))
- stopifnot(is.character(group_by))
- stopifnot(is.character(having))
- stopifnot(is.character(order_by))
- stopifnot(is.null(limit) || (is.numeric(limit) && length(limit) == 1L))
- stopifnot(is.logical(distinct), length(distinct) == 1L)
-
- structure(
- list(
- from = from,
- select = select,
- where = where,
- group_by = group_by,
- having = having,
- order_by = order_by,
- distinct = distinct,
- limit = limit
- ),
- class = c("select_query", "query")
- )
-}
-
-#' @export
-print.select_query <- function(x, ...) {
- cat("<SQL SELECT", if (x$distinct) " DISTINCT", ">\n", sep = "")
- cat("From: ", x$from, "\n", sep = "")
-
- if (length(x$select)) cat("Select: ", named_commas(x$select), "\n", sep = "")
- if (length(x$where)) cat("Where: ", named_commas(x$where), "\n", sep = "")
- if (length(x$group_by)) cat("Group by: ", named_commas(x$group_by), "\n", sep = "")
- if (length(x$order_by)) cat("Order by: ", named_commas(x$order_by), "\n", sep = "")
- if (length(x$having)) cat("Having: ", named_commas(x$having), "\n", sep = "")
- if (length(x$limit)) cat("Limit: ", x$limit, "\n", sep = "")
-}
-
-
-#' @export
-#' @rdname sql_build
-join_query <- function(x, y, type = "inner", by = NULL, suffix = c(".x", ".y")) {
- structure(
- list(
- x = x,
- y = y,
- type = type,
- by = by,
- suffix = suffix
- ),
- class = c("join_query", "query")
- )
-}
-
-#' @export
-print.join_query <- function(x, ...) {
- cat("<SQL JOIN (", toupper(x$type), ")>\n", sep = "")
- cat("By: ", paste0(x$by$x, "-", x$by$y, collapse = ", "), "\n", sep = "")
-
- cat(named_rule("X"), "\n", sep = "")
- print(x$x$ops)
- cat(named_rule("Y"), "\n", sep = "")
- print(x$y$ops)
-}
-
-#' @export
-#' @rdname sql_build
-semi_join_query <- function(x, y, anti = FALSE, by = NULL) {
- structure(
- list(
- x = x,
- y = y,
- anti = anti,
- by = by
- ),
- class = c("semi_join_query", "query")
- )
-}
-
-#' @export
-print.semi_join_query <- function(x, ...) {
- cat("<SQL ", if (x$anti) "ANTI" else "SEMI", " JOIN>\n", sep = "")
- cat("By: ", paste0(x$by$x, "-", x$by$y, collapse = ", "), "\n", sep = "")
-
- cat(named_rule("X"), "\n", sep = "")
- print(x$x$ops)
- cat(named_rule("Y"), "\n", sep = "")
- print(x$y$ops)
-}
-
-#' @export
-#' @rdname sql_build
-set_op_query <- function(x, y, type = type) {
- structure(
- list(
- x = x,
- y = y,
- type = type
- ),
- class = c("set_op_query", "query")
- )
-}
-
-#' @export
-print.set_op_query <- function(x, ...) {
- cat("<SQL ", x$type, ">\n", sep = "")
-
- cat(named_rule("X"), "\n", sep = "")
- print(x$x$ops)
- cat(named_rule("Y"), "\n", sep = "")
- print(x$y$ops)
-}
diff --git a/R/sql-render.R b/R/sql-render.R
deleted file mode 100644
index 772bef9..0000000
--- a/R/sql-render.R
+++ /dev/null
@@ -1,70 +0,0 @@
-#' @export
-#' @rdname sql_build
-sql_render <- function(query, con = NULL, ...) {
- UseMethod("sql_render")
-}
-
-#' @export
-sql_render.op <- function(query, con = NULL, ...) {
- sql_render(sql_build(query, ...), con = con, ...)
-}
-
-#' @export
-sql_render.tbl_sql <- function(query, con = NULL, ...) {
- sql_render(sql_build(query$ops, query$src$con, ...), con = query$src$con, ...)
-}
-
-#' @export
-sql_render.tbl_lazy <- function(query, con = NULL, ...) {
- sql_render(sql_build(query$ops, con = NULL, ...), con = NULL, ...)
-}
-
-#' @export
-sql_render.select_query <- function(query, con = NULL, ..., root = FALSE) {
- from <- sql_subquery(con, sql_render(query$from, con, ..., root = root), name = NULL)
-
- sql_select(
- con, query$select, from, where = query$where, group_by = query$group_by,
- having = query$having, order_by = query$order_by, limit = query$limit,
- distinct = query$distinct,
- ...
- )
-}
-
-#' @export
-sql_render.ident <- function(query, con = NULL, ..., root = TRUE) {
- if (root) {
- sql_select(con, sql("*"), query)
- } else {
- query
- }
-}
-
-#' @export
-sql_render.sql <- function(query, con = NULL, ...) {
- query
-}
-
-#' @export
-sql_render.join_query <- function(query, con = NULL, ..., root = FALSE) {
- from_x <- sql_subquery(con, sql_render(query$x, con, ..., root = root), name = NULL)
- from_y <- sql_subquery(con, sql_render(query$y, con, ..., root = root), name = NULL)
-
- sql_join(con, from_x, from_y, type = query$type, by = query$by)
-}
-
-#' @export
-sql_render.semi_join_query <- function(query, con = NULL, ..., root = FALSE) {
- from_x <- sql_subquery(con, sql_render(query$x, con, ..., root = root), name = "_LEFT")
- from_y <- sql_subquery(con, sql_render(query$y, con, ..., root = root), name = "_RIGHT")
-
- sql_semi_join(con, from_x, from_y, anti = query$anti, by = query$by)
-}
-
-#' @export
-sql_render.set_op_query <- function(query, con = NULL, ..., root = FALSE) {
- from_x <- sql_render(query$x, con, ..., root = TRUE)
- from_y <- sql_render(query$y, con, ..., root = TRUE)
-
- sql_set_op(con, from_x, from_y, method = query$type)
-}
diff --git a/R/sql-star.r b/R/sql-star.r
deleted file mode 100644
index f11bee1..0000000
--- a/R/sql-star.r
+++ /dev/null
@@ -1,15 +0,0 @@
-star <- function() quote(`*`)
-has_star <- function(x) any_apply(x, is.star)
-is.star <- function(x) identical(x, star())
-remove_star <- function(x) {
- if (is.null(x)) return(x)
-
- is_star <- vapply(x, is.star, logical(1))
- x[!is_star]
-}
-
-expand_star <- function(x, tbl) {
- if (!has_star(x)) return(x)
-
- c(remove_star(x), lapply(colnames(tbl), as.name))
-}
diff --git a/R/src-local.r b/R/src-local.r
index 65c14db..517101f 100644
--- a/R/src-local.r
+++ b/R/src-local.r
@@ -3,10 +3,10 @@
#' This is mainly useful for testing, since makes it possible to refer to
#' local and remote tables using exactly the same syntax.
#'
-#' Generally, \code{src_local} should not be called directly, but instead
+#' Generally, `src_local()` should not be called directly, but instead
#' one of the constructors should be used.
#'
-#' @param tbl name of the function used to generate \code{tbl} objects
+#' @param tbl name of the function used to generate `tbl` objects
#' @param pkg,env Either the name of a package or an environment object in
#' which to look for objects.
#' @keywords internal
@@ -17,12 +17,15 @@
#' }
src_local <- function(tbl, pkg = NULL, env = NULL) {
if (!xor(is.null(pkg), is.null(env))) {
- stop("Must supply exactly one of pkg and env", call. = FALSE)
+ glubort(NULL, "Exactly one of `pkg` and `env` must be non-NULL, ",
+ "not {(!is.null(pkg)) + (!is.null(env))}"
+ )
}
if (!is.null(pkg)) {
env <- getNamespaceInfo(pkg, "lazydata")
name <- paste0("<package: ", pkg, ">")
} else {
+ stopifnot(is.environment(env))
name <- utils::capture.output(print(env))
}
@@ -50,13 +53,23 @@ tbl.src_local <- function(src, from, ...) {
}
#' @export
-copy_to.src_local <- function(dest, df, name = deparse(substitute(df)), ...) {
+copy_to.src_local <- function(dest, df, name = deparse(substitute(df)),
+ overwrite = FALSE, ...) {
+
+ if (!overwrite && exists(name, envir = dest$env, inherits = FALSE)) {
+ glubort(NULL, "object with `name` = {fmt_obj(name)} must not already exist, ",
+ "unless `overwrite` = TRUE"
+ )
+ }
+
assign(name, envir = dest$env, df)
tbl(dest, name)
}
#' @export
format.src_local <- function(x, ...) {
- paste0("src: ", x$name, "\n",
- wrap("tbls: ", paste0(sort(src_tbls(x)), collapse = ", ")))
+ paste0(
+ "src: ", x$name, "\n",
+ wrap("tbls: ", paste0(sort(src_tbls(x)), collapse = ", "))
+ )
}
diff --git a/R/src-mysql.r b/R/src-mysql.r
deleted file mode 100644
index 10c3fe3..0000000
--- a/R/src-mysql.r
+++ /dev/null
@@ -1,225 +0,0 @@
-#' Connect to mysql/mariadb.
-#'
-#' Use \code{src_mysql} to connect to an existing mysql or mariadb database,
-#' and \code{tbl} to connect to tables within that database.
-#' If you are running a local mysqlql database, leave all parameters set as
-#' their defaults to connect. If you're connecting to a remote database,
-#' ask your database administrator for the values of these variables.
-#'
-#' @template db-info
-#' @param dbname Database name
-#' @param host,port Host name and port number of database
-#' @param user,password User name and password. Rather than supplying a
-#' username and password here, it's better to save them in \code{my.cnf},
-#' as described in \code{\link[RMySQL]{MySQL}}. In that case, supply
-#' \code{NULL} to both \code{user} and \code{password}.
-#' @param ... for the src, other arguments passed on to the underlying
-#' database connector, \code{dbConnect}. For the tbl, included for
-#' compatibility with the generic, but otherwise ignored.
-#' @param src a mysql src created with \code{src_mysql}.
-#' @param from Either a string giving the name of table in database, or
-#' \code{\link{sql}} described a derived table or compound join.
-#' @export
-#' @examples
-#' \dontrun{
-#' # Connection basics ---------------------------------------------------------
-#' # To connect to a database first create a src:
-#' my_db <- src_mysql(host = "blah.com", user = "hadley",
-#' password = "pass")
-#' # Then reference a tbl within that src
-#' my_tbl <- tbl(my_db, "my_table")
-#' }
-#'
-#' # Here we'll use the Lahman database: to create your own local copy,
-#' # create a local database called "lahman", or tell lahman_mysql() how to
-#' # a database that you can write to
-#'
-#' if (!has_lahman("postgres") && has_lahman("mysql")) {
-#' lahman_m <- lahman_mysql()
-#' # Methods -------------------------------------------------------------------
-#' batting <- tbl(lahman_m, "Batting")
-#' dim(batting)
-#' colnames(batting)
-#' head(batting)
-#'
-#' # Data manipulation verbs ---------------------------------------------------
-#' filter(batting, yearID > 2005, G > 130)
-#' select(batting, playerID:lgID)
-#' arrange(batting, playerID, desc(yearID))
-#' summarise(batting, G = mean(G), n = n())
-#' mutate(batting, rbi2 = 1.0 * R / AB)
-#'
-#' # note that all operations are lazy: they don't do anything until you
-#' # request the data, either by `print()`ing it (which shows the first ten
-#' # rows), by looking at the `head()`, or `collect()` the results locally.
-#'
-#' system.time(recent <- filter(batting, yearID > 2010))
-#' system.time(collect(recent))
-#'
-#' # Group by operations -------------------------------------------------------
-#' # To perform operations by group, create a grouped object with group_by
-#' players <- group_by(batting, playerID)
-#' group_size(players)
-#'
-#' # MySQL doesn't support windowed functions, which means that only
-#' # grouped summaries are really useful:
-#' summarise(players, mean_g = mean(G), best_ab = max(AB))
-#'
-#' # When you group by multiple level, each summarise peels off one level
-#' per_year <- group_by(batting, playerID, yearID)
-#' stints <- summarise(per_year, stints = max(stint))
-#' filter(ungroup(stints), stints > 3)
-#' summarise(stints, max(stints))
-#'
-#' # Joins ---------------------------------------------------------------------
-#' player_info <- select(tbl(lahman_m, "Master"), playerID,
-#' birthYear)
-#' hof <- select(filter(tbl(lahman_m, "HallOfFame"), inducted == "Y"),
-#' playerID, votedBy, category)
-#'
-#' # Match players and their hall of fame data
-#' inner_join(player_info, hof)
-#' # Keep all players, match hof data where available
-#' left_join(player_info, hof)
-#' # Find only players in hof
-#' semi_join(player_info, hof)
-#' # Find players not in hof
-#' anti_join(player_info, hof)
-#'
-#' # Arbitrary SQL -------------------------------------------------------------
-#' # You can also provide sql as is, using the sql function:
-#' batting2008 <- tbl(lahman_m,
-#' sql("SELECT * FROM Batting WHERE YearID = 2008"))
-#' batting2008
-#' }
-src_mysql <- function(dbname, host = NULL, port = 0L, user = "root",
- password = "", ...) {
- if (!requireNamespace("RMySQL", quietly = TRUE)) {
- stop("RMySQL package required to connect to mysql/mariadb", call. = FALSE)
- }
-
- con <- dbConnect(RMySQL::MySQL(), dbname = dbname , host = host, port = port,
- username = user, password = password, ...)
- info <- dbGetInfo(con)
-
- src_sql("mysql", con,
- info = info, disco = db_disconnector(con, "mysql"))
-}
-
-#' @export
-#' @rdname src_mysql
-tbl.src_mysql <- function(src, from, ...) {
- tbl_sql("mysql", src = src, from = from, ...)
-}
-
-#' @export
-src_desc.src_mysql <- function(x) {
- info <- x$info
-
- paste0("mysql ", info$serverVersion, " [", info$user, "@",
- info$host, ":", info$port, "/", info$dbname, "]")
-}
-
-#' @export
-sql_translate_env.MySQLConnection <- function(con) {
- sql_variant(
- base_scalar,
- sql_translator(.parent = base_agg,
- n = function() sql("count(*)"),
- sd = sql_prefix("stddev_samp"),
- var = sql_prefix("var_samp"),
- paste = function(x, collapse) build_sql("group_concat(", x, collapse, ")")
- )
- )
-}
-
-# DBI methods ------------------------------------------------------------------
-
-#' @export
-db_has_table.MySQLConnection <- function(con, table, ...) {
- # MySQL has no way to list temporary tables, so we always NA to
- # skip any local checks and rely on the database to throw informative errors
- NA
-}
-
-#' @export
-db_data_type.MySQLConnection <- function(con, fields, ...) {
- char_type <- function(x) {
- n <- max(nchar(as.character(x), "bytes"))
- if (n <= 65535) {
- paste0("varchar(", n, ")")
- } else {
- "mediumtext"
- }
- }
-
- data_type <- function(x) {
- switch(class(x)[1],
- logical = "boolean",
- integer = "integer",
- numeric = "double",
- factor = char_type(x),
- character = char_type(x),
- Date = "date",
- POSIXct = "datetime",
- stop("Unknown class ", paste(class(x), collapse = "/"), call. = FALSE)
- )
- }
- vapply(fields, data_type, character(1))
-}
-
-#' @export
-db_begin.MySQLConnection <- function(con, ...) {
- dbGetQuery(con, "START TRANSACTION")
-}
-
-#' @export
-db_commit.MySQLConnection <- function(con, ...) {
- dbGetQuery(con, "COMMIT")
-}
-
-#' @export
-db_insert_into.MySQLConnection <- function(con, table, values, ...) {
-
- # Convert factors to strings
- is_factor <- vapply(values, is.factor, logical(1))
- values[is_factor] <- lapply(values[is_factor], as.character)
-
- # Encode special characters in strings
- is_char <- vapply(values, is.character, logical(1))
- values[is_char] <- lapply(values[is_char], encodeString)
-
- tmp <- tempfile(fileext = ".csv")
- utils::write.table(values, tmp, sep = "\t", quote = FALSE, qmethod = "escape",
- row.names = FALSE, col.names = FALSE)
-
- sql <- build_sql("LOAD DATA LOCAL INFILE ", encodeString(tmp), " INTO TABLE ",
- ident(table), con = con)
- dbGetQuery(con, sql)
-
- invisible()
-}
-
-#' @export
-db_create_index.MySQLConnection <- function(con, table, columns, name = NULL,
- unique = FALSE, ...) {
- name <- name %||% paste0(c(table, columns), collapse = "_")
- fields <- escape(ident(columns), parens = TRUE, con = con)
- index <- build_sql(
- "ADD ", if (unique) sql("UNIQUE "), "INDEX ", ident(name), " ", fields,
- con = con)
-
- sql <- build_sql("ALTER TABLE ", ident(table), "\n", index, con = con)
- dbGetQuery(con, sql)
-}
-
-#' @export
-db_analyze.MySQLConnection <- function(con, table, ...) {
- sql <- build_sql("ANALYZE TABLE", ident(table), con = con)
- dbGetQuery(con, sql)
-}
-
-#' @export
-sql_escape_ident.MySQLConnection <- function(con, x) {
- sql_quote(x, "`")
-}
diff --git a/R/src-postgres.r b/R/src-postgres.r
deleted file mode 100644
index 0ec2dcd..0000000
--- a/R/src-postgres.r
+++ /dev/null
@@ -1,197 +0,0 @@
-#' Connect to postgresql.
-#'
-#' Use \code{src_postgres} to connect to an existing postgresql database,
-#' and \code{tbl} to connect to tables within that database.
-#' If you are running a local postgresql database, leave all parameters set as
-#' their defaults to connect. If you're connecting to a remote database,
-#' ask your database administrator for the values of these variables.
-#'
-#' @template db-info
-#' @param dbname Database name
-#' @param host,port Host name and port number of database
-#' @param user,password User name and password (if needed)
-#' @param ... for the src, other arguments passed on to the underlying
-#' database connector, \code{dbConnect}. For the tbl, included for
-#' compatibility with the generic, but otherwise ignored.
-#' @param src a postgres src created with \code{src_postgres}.
-#' @param from Either a string giving the name of table in database, or
-#' \code{\link{sql}} described a derived table or compound join.
-#' @export
-#' @examples
-#' \dontrun{
-#' # Connection basics ---------------------------------------------------------
-#' # To connect to a database first create a src:
-#' my_db <- src_postgres(host = "blah.com", user = "hadley",
-#' password = "pass")
-#' # Then reference a tbl within that src
-#' my_tbl <- tbl(my_db, "my_table")
-#' }
-#'
-#' # Here we'll use the Lahman database: to create your own local copy,
-#' # create a local database called "lahman", or tell lahman_postgres() how to
-#' # access a database that you can write to
-#'
-#' if (has_lahman("postgres")) {
-#' lahman_p <- lahman_postgres()
-#' # Methods -------------------------------------------------------------------
-#' batting <- tbl(lahman_p, "Batting")
-#' dim(batting)
-#' colnames(batting)
-#' head(batting)
-#'
-#' # Data manipulation verbs ---------------------------------------------------
-#' filter(batting, yearID > 2005, G > 130)
-#' select(batting, playerID:lgID)
-#' arrange(batting, playerID, desc(yearID))
-#' summarise(batting, G = mean(G), n = n())
-#' mutate(batting, rbi2 = if(is.null(AB)) 1.0 * R / AB else 0)
-#'
-#' # note that all operations are lazy: they don't do anything until you
-#' # request the data, either by `print()`ing it (which shows the first ten
-#' # rows), by looking at the `head()`, or `collect()` the results locally.
-#'
-#' system.time(recent <- filter(batting, yearID > 2010))
-#' system.time(collect(recent))
-#'
-#' # Group by operations -------------------------------------------------------
-#' # To perform operations by group, create a grouped object with group_by
-#' players <- group_by(batting, playerID)
-#' group_size(players)
-#'
-#' summarise(players, mean_g = mean(G), best_ab = max(AB))
-#' best_year <- filter(players, AB == max(AB) | G == max(G))
-#' best_year
-#'
-#' progress <- mutate(players,
-#' cyear = yearID - min(yearID) + 1,
-#' ab_rank = rank(desc(AB)),
-#' cumulative_ab = order_by(yearID, cumsum(AB)))
-#'
-#' # When you group by multiple level, each summarise peels off one level
-#' per_year <- group_by(batting, playerID, yearID)
-#' stints <- summarise(per_year, stints = max(stint))
-#' filter(stints, stints > 3)
-#' summarise(stints, max(stints))
-#' mutate(stints, order_by(yearID, cumsum(stints)))
-#'
-#' # Joins ---------------------------------------------------------------------
-#' player_info <- select(tbl(lahman_p, "Master"), playerID, birthYear)
-#' hof <- select(filter(tbl(lahman_p, "HallOfFame"), inducted == "Y"),
-#' playerID, votedBy, category)
-#'
-#' # Match players and their hall of fame data
-#' inner_join(player_info, hof)
-#' # Keep all players, match hof data where available
-#' left_join(player_info, hof)
-#' # Find only players in hof
-#' semi_join(player_info, hof)
-#' # Find players not in hof
-#' anti_join(player_info, hof)
-#'
-#' # Arbitrary SQL -------------------------------------------------------------
-#' # You can also provide sql as is, using the sql function:
-#' batting2008 <- tbl(lahman_p,
-#' sql('SELECT * FROM "Batting" WHERE "yearID" = 2008'))
-#' batting2008
-#' }
-src_postgres <- function(dbname = NULL, host = NULL, port = NULL, user = NULL,
- password = NULL, ...) {
- if (!requireNamespace("RPostgreSQL", quietly = TRUE)) {
- stop("RPostgreSQL package required to connect to postgres db", call. = FALSE)
- }
-
- user <- user %||% if (in_travis()) "postgres" else ""
-
- con <- dbConnect(RPostgreSQL::PostgreSQL(), host = host %||% "", dbname = dbname %||% "",
- user = user, password = password %||% "", port = port %||% "", ...)
- info <- dbGetInfo(con)
-
- src_sql("postgres", con,
- info = info, disco = db_disconnector(con, "postgres"))
-}
-
-#' @export
-#' @rdname src_postgres
-tbl.src_postgres <- function(src, from, ...) {
- tbl_sql("postgres", src = src, from = from, ...)
-}
-
-#' @export
-src_desc.src_postgres <- function(x) {
- info <- x$info
- host <- if (info$host == "") "localhost" else info$host
-
- paste0("postgres ", info$serverVersion, " [", info$user, "@",
- host, ":", info$port, "/", info$dbname, "]")
-}
-
-#' @export
-sql_translate_env.PostgreSQLConnection <- function(con) {
- sql_variant(
- base_scalar,
- sql_translator(.parent = base_agg,
- n = function() sql("count(*)"),
- cor = sql_prefix("corr"),
- cov = sql_prefix("covar_samp"),
- sd = sql_prefix("stddev_samp"),
- var = sql_prefix("var_samp"),
- all = sql_prefix("bool_and"),
- any = sql_prefix("bool_or"),
- paste = function(x, collapse) build_sql("string_agg(", x, ", ", collapse, ")")
- ),
- base_win
- )
-}
-
-# DBI methods ------------------------------------------------------------------
-
-# Doesn't return TRUE for temporary tables
-#' @export
-db_has_table.PostgreSQLConnection <- function(con, table, ...) {
- table %in% db_list_tables(con)
-}
-
-#' @export
-db_begin.PostgreSQLConnection <- function(con, ...) {
- dbGetQuery(con, "BEGIN TRANSACTION")
-}
-
-# http://www.postgresql.org/docs/9.3/static/sql-explain.html
-#' @export
-db_explain.PostgreSQLConnection <- function(con, sql, format = "text", ...) {
- format <- match.arg(format, c("text", "json", "yaml", "xml"))
-
- exsql <- build_sql("EXPLAIN ",
- if (!is.null(format)) build_sql("(FORMAT ", sql(format), ") "),
- sql)
- expl <- dbGetQuery(con, exsql)
-
- paste(expl[[1]], collapse = "\n")
-}
-
-#' @export
-db_insert_into.PostgreSQLConnection <- function(con, table, values, ...) {
-
- if (nrow(values) == 0)
- return(NULL)
-
- cols <- lapply(values, escape, collapse = NULL, parens = FALSE, con = con)
- col_mat <- matrix(unlist(cols, use.names = FALSE), nrow = nrow(values))
-
- rows <- apply(col_mat, 1, paste0, collapse = ", ")
- values <- paste0("(", rows, ")", collapse = "\n, ")
-
- sql <- build_sql("INSERT INTO ", ident(table), " VALUES ", sql(values))
- dbGetQuery(con, sql)
-}
-
-#' @export
-db_query_fields.PostgreSQLConnection <- function(con, sql, ...) {
- fields <- build_sql("SELECT * FROM ", sql_subquery(con, sql), " WHERE 0=1",
- con = con)
-
- qry <- dbSendQuery(con, fields)
- on.exit(dbClearResult(qry))
-
- dbGetInfo(qry)$fieldDescription[[1]]$name
-}
diff --git a/R/src-sql.r b/R/src-sql.r
deleted file mode 100644
index c91faf1..0000000
--- a/R/src-sql.r
+++ /dev/null
@@ -1,32 +0,0 @@
-#' Create a "sql src" object
-#'
-#' \code{src_sql} is the standard constructor for all SQL based srcs.
-#'
-#' @keywords internal
-#' @export
-#' @param subclass name of subclass. "src_sql" is an abstract base class, so you
-#' must supply this value. \code{src_} is automatically prepended to the
-#' class name
-#' @param con the connection object
-#' @param ... fields used by object
-src_sql <- function(subclass, con, ...) {
- subclass <- paste0("src_", subclass)
- structure(list(con = con, ...), class = c(subclass, "src_sql", "src"))
-}
-
-#' @export
-same_src.src_sql <- function(x, y) {
- if (!inherits(y, "src_sql")) return(FALSE)
- identical(x$con, y$con)
-}
-
-#' @export
-src_tbls.src_sql <- function(x, ...) {
- db_list_tables(x$con)
-}
-
-#' @export
-format.src_sql <- function(x, ...) {
- paste0("src: ", src_desc(x), "\n",
- wrap("tbls: ", paste0(sort(src_tbls(x)), collapse = ", ")))
-}
diff --git a/R/src-sqlite.r b/R/src-sqlite.r
deleted file mode 100644
index 87005c6..0000000
--- a/R/src-sqlite.r
+++ /dev/null
@@ -1,195 +0,0 @@
-#' Connect to a sqlite database.
-#'
-#' Use \code{src_sqlite} to connect to an existing sqlite database,
-#' and \code{tbl} to connect to tables within that database.
-#' If you are running a local sqliteql database, leave all parameters set as
-#' their defaults to connect. If you're connecting to a remote database,
-#' ask your database administrator for the values of these variables.
-#' \code{\link{src_memdb}} is an easy way to use an in-memory SQLite database
-#' that is scoped to the current session.
-#'
-#' @template db-info
-#' @param path Path to SQLite database
-#' @param create if \code{FALSE}, \code{path} must already exist. If
-#' \code{TRUE}, will create a new SQlite3 database at \code{path} if
-#' \code{path} does not exist and connect to the existing database if
-#' \code{path} does exist.
-#' @param src a sqlite src created with \code{src_sqlite}.
-#' @param from Either a string giving the name of table in database, or
-#' \code{\link{sql}} described a derived table or compound join.
-#' @param ... Included for compatibility with the generic, but otherwise
-#' ignored.
-#' @export
-#' @examples
-#' \dontrun{
-#' # Connection basics ---------------------------------------------------------
-#' # To connect to a database first create a src:
-#' my_db <- src_sqlite(path = tempfile(), create = TRUE)
-#' # Then reference a tbl within that src
-#' my_tbl <- tbl(my_db, "my_table")
-#' }
-#'
-#' # Here we'll use the Lahman database: to create your own local copy,
-#' # run lahman_sqlite()
-#'
-#' \dontrun{
-#' if (requireNamespace("RSQLite") && has_lahman("sqlite")) {
-#' lahman_s <- lahman_sqlite()
-#' # Methods -------------------------------------------------------------------
-#' batting <- tbl(lahman_s, "Batting")
-#' dim(batting)
-#' colnames(batting)
-#' head(batting)
-#'
-#' # Data manipulation verbs ---------------------------------------------------
-#' filter(batting, yearID > 2005, G > 130)
-#' select(batting, playerID:lgID)
-#' arrange(batting, playerID, desc(yearID))
-#' summarise(batting, G = mean(G), n = n())
-#' mutate(batting, rbi2 = 1.0 * R / AB)
-#'
-#' # note that all operations are lazy: they don't do anything until you
-#' # request the data, either by `print()`ing it (which shows the first ten
-#' # rows), by looking at the `head()`, or `collect()` the results locally.
-#'
-#' system.time(recent <- filter(batting, yearID > 2010))
-#' system.time(collect(recent))
-#'
-#' # Group by operations -------------------------------------------------------
-#' # To perform operations by group, create a grouped object with group_by
-#' players <- group_by(batting, playerID)
-#' group_size(players)
-#'
-#' # sqlite doesn't support windowed functions, which means that only
-#' # grouped summaries are really useful:
-#' summarise(players, mean_g = mean(G), best_ab = max(AB))
-#'
-#' # When you group by multiple level, each summarise peels off one level
-#' per_year <- group_by(batting, playerID, yearID)
-#' stints <- summarise(per_year, stints = max(stint))
-#' filter(ungroup(stints), stints > 3)
-#' summarise(stints, max(stints))
-#'
-#' # Joins ---------------------------------------------------------------------
-#' player_info <- select(tbl(lahman_s, "Master"), playerID, birthYear)
-#' hof <- select(filter(tbl(lahman_s, "HallOfFame"), inducted == "Y"),
-#' playerID, votedBy, category)
-#'
-#' # Match players and their hall of fame data
-#' inner_join(player_info, hof)
-#' # Keep all players, match hof data where available
-#' left_join(player_info, hof)
-#' # Find only players in hof
-#' semi_join(player_info, hof)
-#' # Find players not in hof
-#' anti_join(player_info, hof)
-#'
-#' # Arbitrary SQL -------------------------------------------------------------
-#' # You can also provide sql as is, using the sql function:
-#' batting2008 <- tbl(lahman_s,
-#' sql("SELECT * FROM Batting WHERE YearID = 2008"))
-#' batting2008
-#' }
-#' }
-src_sqlite <- function(path, create = FALSE) {
- if (!requireNamespace("RSQLite", quietly = TRUE)) {
- stop("RSQLite package required to connect to sqlite db", call. = FALSE)
- }
-
- if (!create && !file.exists(path)) {
- stop("Path does not exist and create = FALSE", call. = FALSE)
- }
-
- con <- DBI::dbConnect(RSQLite::SQLite(), path)
- RSQLite::initExtension(con)
-
- src_sql("sqlite", con, path = path)
-}
-
-#' Per-session in-memory SQLite databases.
-#'
-#' \code{src_memdb} lets you easily access a sessio-temporary in-memory
-#' SQLite database. \code{memdb_frame()} works like \code{\link{data_frame}},
-#' but instead of creating a new data frame in R, it creates a table in
-#' \code{src_memdb}
-#'
-#' @export
-#' @examples
-#' if (require("RSQLite")) {
-#' src_memdb()
-#'
-#' df <- memdb_frame(x = runif(100), y = runif(100))
-#' df %>% arrange(x)
-#' df %>% arrange(x) %>% show_query()
-#' }
-src_memdb <- function() {
- cache_computation("src_memdb", src_sqlite(":memory:", TRUE))
-}
-
-#' @inheritParams tibble::data_frame
-#' @param .name Name of table in database: defaults to a random name that's
-#' unlikely to conflict with exist
-#' @export
-#' @rdname src_memdb
-memdb_frame <- function(..., .name = random_table_name()) {
- copy_to(src_memdb(), data_frame(...), name = .name)
-}
-
-#' @export
-#' @rdname src_sqlite
-tbl.src_sqlite <- function(src, from, ...) {
- tbl_sql("sqlite", src = src, from = from, ...)
-}
-
-#' @export
-src_desc.src_sqlite <- function(x) {
- paste0("sqlite ", sqlite_version(), " [", x$path, "]")
-}
-
-sqlite_version <- function() {
- if (utils::packageVersion("RSQLite") > 1) {
- RSQLite::rsqliteVersion()[[2]]
- } else {
- DBI::dbGetInfo(RSQLite::SQLite())$clientVersion
- }
-}
-
-#' @export
-sql_translate_env.SQLiteConnection <- function(con) {
- sql_variant(
- sql_translator(.parent = base_scalar,
- log = sql_prefix("log")
- ),
- sql_translator(.parent = base_agg,
- sd = sql_prefix("stdev")
- ),
- base_no_win
- )
-}
-
-#' @export
-sql_escape_ident.SQLiteConnection <- function(con, x) {
- sql_quote(x, '`')
-}
-
-#' @export
-sql_subquery.SQLiteConnection <- function(con, from, name = unique_name(), ...) {
- if (is.ident(from)) {
- setNames(from, name)
- } else {
- if (is.null(name)) {
- build_sql("(", from, ")", con = con)
- } else {
- build_sql("(", from, ") AS ", ident(name), con = con)
- }
- }
-}
-
-
-# DBI methods ------------------------------------------------------------------
-
-#' @export
-db_insert_into.SQLiteConnection <- function(con, table, values, ...) {
- DBI::dbWriteTable(con, table, values, append = TRUE, row.names = FALSE)
-}
-
diff --git a/R/src-test.r b/R/src-test.r
deleted file mode 100644
index 99645a4..0000000
--- a/R/src-test.r
+++ /dev/null
@@ -1,29 +0,0 @@
-#' A set of DBI methods to ease unit testing dplyr with DBI
-#' @name src-test
-#' @param con A database connection.
-#' @param x Object to transform
-#' @param sql A string containing an sql query.
-#' @param ... Other arguments passed on to the individual methods
-NULL
-
-#' @export
-#' @rdname src-test
-db_query_fields.DBITestConnection <- function(con, sql, ...) {
- c("field1")
-}
-
-#' @export
-#' @rdname src-test
-sql_escape_ident.DBITestConnection <- function(con, x) {
- sql_quote(x, '`')
-}
-
-#' @export
-#' @rdname src-test
-sql_translate_env.DBITestConnection <- function(con) {
- dplyr::sql_variant(
- scalar = dplyr::sql_translator(.parent = dplyr::base_scalar),
- aggregate = dplyr::sql_translator(.parent = dplyr::base_agg),
- window = dplyr::sql_translator(.parent = dplyr::base_win)
- )
-}
diff --git a/R/src.r b/R/src.r
index 02d64d0..2814783 100644
--- a/R/src.r
+++ b/R/src.r
@@ -1,17 +1,19 @@
#' Create a "src" object
#'
-#' \code{src} is the standard constructor for srcs and \code{is.src} tests.
+#' `src()` is the standard constructor for srcs and `is.src()` tests.
#'
#' @keywords internal
#' @export
#' @param subclass name of subclass. "src" is an abstract base class, so you
-#' must supply this value. \code{src_} is automatically prepended to the
+#' must supply this value. `src_` is automatically prepended to the
#' class name
-#' @param ... fields used by object
+#' @param ... fields used by object.
+#'
+#' These dots are evaluated with [explicit splicing][rlang::dots_list].
#' @param x object to test for "src"-ness.
src <- function(subclass, ...) {
subclass <- paste0("src_", subclass)
- structure(list(...), class = c(subclass, "src"))
+ structure(dots_list(...), class = c(subclass, "src"))
}
#' @rdname src
@@ -31,6 +33,7 @@ print.src <- function(x, ...) {
#'
#' @param x a data src.
#' @export
+#' @keywords internal
src_tbls <- function(x) {
UseMethod("src_tbls")
}
diff --git a/R/src_dbi.R b/R/src_dbi.R
new file mode 100644
index 0000000..fd5a7c5
--- /dev/null
+++ b/R/src_dbi.R
@@ -0,0 +1,173 @@
+#' Source for database backends
+#'
+#' @description
+#' For backward compatibility dplyr provides three srcs for popular
+#' open source databases:
+#'
+#' * `src_mysql()` connects to a MySQL or MariaDB database using [RMySQL::MySQL()].
+#' * `src_postgres()` connects to PostgreSQL using [RPostgreSQL::PostgreSQL()]
+#' * `src_sqlite()` to connect to a SQLite database using [RSQLite::SQLite()].
+#'
+#' However, modern best practice is to use [tbl()] directly on an `DBIConnection`.
+#'
+#' @details
+#' All data manipulation on SQL tbls are lazy: they will not actually
+#' run the query or retrieve the data unless you ask for it: they all return
+#' a new `tbl_dbi` object. Use [compute()] to run the query and save the
+#' results in a temporary in the database, or use [collect()] to retrieve the
+#' results to R. You can see the query with [show_query()].
+#'
+#' For best performance, the database should have an index on the variables
+#' that you are grouping by. Use [explain()] to check that the database is using
+#' the indexes that you expect.
+#'
+#' There is one exception: [do()] is not lazy since it must pull the data
+#' into R.
+#'
+#' @param dbname Database name
+#' @param host,port Host name and port number of database
+#' @param user,username,password User name and password.
+#'
+#' Generally, you should avoid saving username and password in your
+#' scripts as it is easy to accidentally expose valuable credentials.
+#' Instead, retrieve them from environment variables, or use database
+#' specific credential scores. For example, with MySQL you can set up `my.cnf`
+#' as described in [RMySQL::MySQL()].
+#' @param ... for the src, other arguments passed on to the underlying
+#' database connector, [DBI::dbConnect()]. For the tbl, included for
+#' compatibility with the generic, but otherwise ignored.
+#' @return An S3 object with class `src_dbi`, `src_sql`, `src`.
+#' @examples
+#' # Basic connection using DBI -------------------------------------------
+#' if (require(dbplyr, quietly = TRUE)) {
+#'
+#' con <- DBI::dbConnect(RSQLite::SQLite(), ":memory:")
+#' copy_to(con, mtcars)
+#'
+#' DBI::dbListTables(con)
+#'
+#' # To retrieve a single table from a source, use `tbl()`
+#' con %>% tbl("mtcars")
+#'
+#' # You can also use pass raw SQL if you want a more sophisticated query
+#' con %>% tbl(sql("SELECT * FROM mtcars WHERE cyl == 8"))
+#'
+#' # To show off the full features of dplyr's database integration,
+#' # we'll use the Lahman database. lahman_sqlite() takes care of
+#' # creating the database.
+#' lahman_p <- lahman_sqlite()
+#' batting <- lahman_p %>% tbl("Batting")
+#' batting
+#'
+#' # Basic data manipulation verbs work in the same way as with a tibble
+#' batting %>% filter(yearID > 2005, G > 130)
+#' batting %>% select(playerID:lgID)
+#' batting %>% arrange(playerID, desc(yearID))
+#' batting %>% summarise(G = mean(G), n = n())
+#'
+#' # There are a few exceptions. For example, databases give integer results
+#' # when dividing one integer by another. Multiply by 1 to fix the problem
+#' batting %>%
+#' select(playerID:lgID, AB, R, G) %>%
+#' mutate(
+#' R_per_game1 = R / G,
+#' R_per_game2 = R * 1.0 / G
+#' )
+#'
+#' # All operations are lazy: they don't do anything until you request the
+#' # data, either by `print()`ing it (which shows the first ten rows),
+#' # or by `collect()`ing the results locally.
+#' system.time(recent <- filter(batting, yearID > 2010))
+#' system.time(collect(recent))
+#'
+#' # You can see the query that dplyr creates with show_query()
+#' batting %>%
+#' filter(G > 0) %>%
+#' group_by(playerID) %>%
+#' summarise(n = n()) %>%
+#' show_query()
+#' }
+#' @name src_dbi
+NULL
+
+#' @rdname src_dbi
+#' @export
+src_mysql <- function(dbname, host = NULL, port = 0L, username = "root",
+ password = "", ...) {
+ check_dbplyr()
+ check_pkg("RMySQL", "connect to MySQL/MariaDB")
+
+ con <- DBI::dbConnect(
+ RMySQL::MySQL(),
+ dbname = dbname,
+ host = host,
+ port = port,
+ username = username,
+ password = password,
+ ...
+ )
+ dbplyr::src_dbi(con, auto_disconnect = TRUE)
+}
+
+#' @rdname src_dbi
+#' @export
+src_postgres <- function(dbname = NULL, host = NULL, port = NULL,
+ user = NULL, password = NULL, ...) {
+ check_dbplyr()
+ check_pkg("RPostgreSQL", "connect to PostgreSQL")
+
+ user <- user %||% if (in_travis()) "postgres" else ""
+
+ con <- DBI::dbConnect(
+ RPostgreSQL::PostgreSQL(),
+ host = host %||% "",
+ dbname = dbname %||% "",
+ user = user,
+ password = password %||% "",
+ port = port %||% "",
+ ...
+ )
+
+ dbplyr::src_dbi(con, auto_disconnect = TRUE)
+}
+
+#' @rdname src_dbi
+#' @export
+#' @param path Path to SQLite database. You can use the special path
+#' ":memory:" to create a temporary in memory database.
+#' @param create if `FALSE`, `path` must already exist. If
+#' `TRUE`, will create a new SQLite3 database at `path` if
+#' `path` does not exist and connect to the existing database if
+#' `path` does exist.
+src_sqlite <- function(path, create = FALSE) {
+ check_dbplyr()
+
+ if (!create && !file.exists(path)) {
+ bad_args("path", "must not already exist, unless `create` = TRUE")
+ }
+
+ con <- DBI::dbConnect(RSQLite::SQLite(), path)
+ RSQLite::initExtension(con)
+
+ dbplyr::src_dbi(con, auto_disconnect = TRUE)
+}
+
+# S3 methods --------------------------------------------------------------
+
+#' @export
+tbl.DBIConnection <- function(src, from, ...) {
+ check_dbplyr()
+ tbl(dbplyr::src_dbi(src), from = from, ...)
+}
+
+#' @export
+copy_to.DBIConnection <- function(dest, df, name = deparse(substitute(df)),
+ overwrite = FALSE, ...) {
+ check_dbplyr()
+ copy_to(dbplyr::src_dbi(dest), df = df, name = name, overwrite = overwrite, ...)
+}
+
+# S4 ----------------------------------------------------------------------
+
+setOldClass(c("sql", "character"))
+setOldClass(c("ident", "sql", "character"))
diff --git a/R/tally.R b/R/tally.R
deleted file mode 100644
index 878deef..0000000
--- a/R/tally.R
+++ /dev/null
@@ -1,91 +0,0 @@
-#' Counts/tally observations by group.
-#'
-#' \code{tally} is a convenient wrapper for summarise that will either call
-#' \code{\link{n}} or \code{\link{sum}(n)} depending on whether you're tallying
-#' for the first time, or re-tallying. \code{count()} is similar, but also
-#' does the \code{\link{group_by}} for you.
-#'
-#' @param x a \code{\link{tbl}} to tally/count.
-#' @param ...,vars Variables to group by.
-#' @param wt (Optional) If omitted, will count the number of rows. If specified,
-#' will perform a "weighted" tally by summing the (non-missing) values of
-#' variable \code{wt}.
-#' @param sort if \code{TRUE} will sort output in descending order of \code{n}
-#' @export
-#' @examples
-#' if (require("Lahman")) {
-#' batting_tbl <- tbl_df(Batting)
-#' tally(group_by(batting_tbl, yearID))
-#' tally(group_by(batting_tbl, yearID), sort = TRUE)
-#'
-#' # Multiple tallys progressively roll up the groups
-#' plays_by_year <- tally(group_by(batting_tbl, playerID, stint), sort = TRUE)
-#' tally(plays_by_year, sort = TRUE)
-#' tally(tally(plays_by_year))
-#'
-#' # This looks a little nicer if you use the infix %>% operator
-#' batting_tbl %>% group_by(playerID) %>% tally(sort = TRUE)
-#'
-#' # count is even more succinct - it also does the grouping for you
-#' batting_tbl %>% count(playerID)
-#' batting_tbl %>% count(playerID, wt = G)
-#' batting_tbl %>% count(playerID, wt = G, sort = TRUE)
-#' }
-tally <- function(x, wt, sort = FALSE) {
- if (missing(wt)) {
- if ("n" %in% names(x)) {
- message("Using n as weighting variable")
- wt <- quote(n)
- } else {
- wt <- NULL
- }
- } else {
- wt <- substitute(wt)
- }
-
- tally_(x, wt, sort = sort)
-}
-
-tally_ <- function(x, wt, sort = FALSE) {
- if (is.null(wt)) {
- n <- quote(n())
- } else {
- n <- lazyeval::interp(quote(sum(wt, na.rm = TRUE)), wt = wt)
- }
-
- n_name <- n_name(tbl_vars(x))
- out <- summarise_(x, .dots = setNames(list(n), n_name))
-
- if (!sort) {
- out
- } else {
- desc_n <- lazyeval::interp(quote(desc(n)), n = as.name(n_name))
- arrange_(out, desc_n)
- }
-}
-
-n_name <- function(x) {
- name <- "n"
- while (name %in% x) {
- name <- paste0(name, "n")
- }
-
- name
-
-}
-
-#' @export
-#' @rdname tally
-count <- function(x, ..., wt = NULL, sort = FALSE) {
- vars <- lazyeval::lazy_dots(...)
- wt <- substitute(wt)
-
- count_(x, vars, wt, sort = sort)
-}
-
-#' @export
-#' @rdname tally
-count_ <- function(x, vars, wt = NULL, sort = FALSE) {
- grouped <- group_by_(x, .dots = vars, add = TRUE)
- tally_(grouped, wt = wt, sort = sort)
-}
diff --git a/R/tbl-cube.r b/R/tbl-cube.r
index cc478d1..bbe0842 100644
--- a/R/tbl-cube.r
+++ b/R/tbl-cube.r
@@ -1,4 +1,4 @@
-#' A data cube tbl.
+#' A data cube tbl
#'
#' A cube tbl stores data in a compact array format where dimension
#' names are not needlessly repeated. They are particularly appropriate for
@@ -7,7 +7,7 @@
#' Compared to data frames, they will occupy much less memory when variables
#' are crossed, not nested.
#'
-#' \code{tbl_cube} support is currently experimental and little performance
+#' `tbl_cube` support is currently experimental and little performance
#' optimisation has been done, but you may find them useful if your data
#' already comes in this form, or you struggle with the memory overhead of the
#' sparse/crossed of data frames. There is no support for hierarchical
@@ -19,29 +19,29 @@
#' Manipulation functions:
#'
#' \itemize{
-#' \item \code{select} (M)
+#' \item `select()` (M)
#'
-#' \item \code{summarise} (M), corresponds to roll-up, but rather more
+#' \item `summarise()` (M), corresponds to roll-up, but rather more
#' limited since there are no hierarchies.
#'
-#' \item \code{filter} (D), corresponds to slice/dice.
+#' \item `filter()` (D), corresponds to slice/dice.
#'
-#' \item \code{mutate} (M) is not implemented, but should be relatively
-#' straightforward given the implementation of \code{summarise}.
+#' \item `mutate()` (M) is not implemented, but should be relatively
+#' straightforward given the implementation of `summarise`.
#'
-#' \item \code{arrange} (D?) Not implemented: not obvious how much sense
+#' \item `arrange()` (D?) Not implemented: not obvious how much sense
#' it would make
#' }
#'
-#' Joins: not implemented. See \code{vignettes/joins.graffle} for ideas.
+#' Joins: not implemented. See `vignettes/joins.graffle` for ideas.
#' Probably straightforward if you get the indexes right, and that's probably
#' some straightforward array/tensor operation.
#'
#' @export
#' @param dimensions A named list of vectors. A dimension is a variable
-#' whose values are known before the experiement is conducted; they are
+#' whose values are known before the experiment is conducted; they are
#' fixed by design (in \pkg{reshape2} they are known as id variables).
-#' \code{tbl_cubes} are dense which means that almost every combination of
+#' `tbl_cubes` are dense which means that almost every combination of
#' the dimensions should have associated measurements: missing values require
#' an explicit NA, so if the variables are nested, not crossed, the
#' majority of the data structure will be empty. Dimensions are typically,
@@ -50,8 +50,8 @@
#' actually measured, and is not known in advance. The dimension of each
#' array should be the same as the length of the dimensions. Measures are
#' typically, but not always, continuous values.
-#' @seealso \code{\link{as.tbl_cube}} for ways of coercing existing data
-#' structures into a \code{tbl_cube}.
+#' @seealso [as.tbl_cube()] for ways of coercing existing data
+#' structures into a `tbl_cube`.
#' @examples
#' # The built in nasa dataset records meterological data (temperature,
#' # cloud cover, ozone etc) for a 4d spatio-temporal dataset (lat, long,
@@ -86,22 +86,29 @@
tbl_cube <- function(dimensions, measures) {
if (!is.list(dimensions) || any_apply(dimensions, Negate(is.atomic)) ||
is.null(names(dimensions))) {
- stop("Dimensions must be a named list of vectors", call. = FALSE)
+ bad_args("dimensions", "must be a named list of vectors, ",
+ "not {type_of(dimensions)}"
+ )
}
if (!is.list(measures) || any_apply(measures, Negate(is.array)) ||
is.null(names(measures))) {
- stop("Measures must be a named list of arrays", call. = FALSE)
+ bad_args("measures", "must be a named list of arrays, ",
+ "not {type_of(measures)}"
+ )
}
# Check measures have correct dimensions
dims <- vapply(dimensions, length, integer(1), USE.NAMES = FALSE)
- dims_ok <- vapply(measures, function(x) identical(unname(dim(x)), dims),
- logical(1))
+ dims_ok <- vapply(
+ measures, function(x) identical(unname(dim(x)), dims),
+ logical(1)
+ )
if (any(!dims_ok)) {
bad <- names(measures)[!dims_ok]
- stop("Measures ", paste0(bad, collapse = ", "), " don't have correct ",
- "dimensions (", paste0(dims, collapse = " x "), ")", call. = FALSE)
+ bad_measures(bad, "needs dimensions {fmt_dims(dims)}, not {bad_dim}",
+ bad_dim = fmt_dims(dim(measures[!dims_ok][[1L]]))
+ )
}
structure(list(dims = dimensions, mets = measures), class = "tbl_cube")
@@ -122,11 +129,12 @@ same_src.tbl_cube <- function(x, y) {
#' @export
print.tbl_cube <- function(x, ...) {
- cat("Source: local array ", dim_desc(x), "\n",
- sep = "")
+ cat("Source: local array ", dim_desc(x), "\n", sep = "")
if (!is.null(x$groups)) {
- cat("Grouped by: ", paste(names(x$dims)[x$groups], collapse = ", "),
- "\n", sep = "")
+ cat(
+ "Grouped by: ", paste(names(x$dims)[x$groups], collapse = ", "), "\n",
+ sep = ""
+ )
}
# Dimensions
@@ -144,11 +152,11 @@ print.tbl_cube <- function(x, ...) {
# Coercion methods (from tbl_cube) ---------------------------------------------
-#' Coerce a \code{tbl_cube} to other data structures
+#' Coerce a `tbl_cube` to other data structures
#'
#' Supports conversion to tables, data frames, tibbles.
#'
-#' @param x a \code{tbl_cube}
+#' @param x a `tbl_cube`
#' @param ... Passed on to individual methods; otherwise ignored.
#' @param measure A measure name or index, default: the first measure
#' @name as.table.tbl_cube
@@ -176,7 +184,7 @@ as.data.frame.tbl_cube <- function(x, ...) {
#' @rdname as.table.tbl_cube
#' @description For a cube, the data frame returned by
-#' \code{\link[tibble]{as_data_frame}} resulting data frame contains the
+#' [tibble::as_data_frame()] resulting data frame contains the
#' dimensions as character values (and not as factors).
#' @export
as_data_frame.tbl_cube <- function(x, ...) {
@@ -185,7 +193,7 @@ as_data_frame.tbl_cube <- function(x, ...) {
# Coercion methods -------------------------------------------------------------
-#' Coerce an existing data structure into a \code{tbl_cube}
+#' Coerce an existing data structure into a `tbl_cube`
#'
#' @param x an object to convert. Built in methods will convert arrays,
#' tables and data frames.
@@ -195,10 +203,11 @@ as.tbl_cube <- function(x, ...) UseMethod("as.tbl_cube")
#' @export
#' @rdname as.tbl_cube
-#' @param dim_names names of the dimesions. Defaults to the names of
+#' @param dim_names names of the dimensions. Defaults to the names of
#' @param met_name a string to use as the name for the measure
-#' the \code{\link{dimnames}}.
-as.tbl_cube.array <- function(x, dim_names = names(dimnames(x)), met_name = deparse(substitute(x)), ...) {
+#' the [dimnames()].
+as.tbl_cube.array <- function(x, dim_names = names(dimnames(x)), met_name = deparse(substitute(x)),
+ ...) {
force(met_name)
dims <- dimnames(x)
@@ -216,7 +225,8 @@ undimname <- function(x) {
#' @export
#' @rdname as.tbl_cube
-as.tbl_cube.table <- function(x, dim_names = names(dimnames(x)), met_name = "Freq", ...) {
+as.tbl_cube.table <- function(x, dim_names = names(dimnames(x)), met_name = "Freq",
+ ...) {
as.tbl_cube.array(unclass(x), dim_names = dim_names, met_name = met_name)
}
@@ -232,13 +242,14 @@ guess_met <- function(df) {
met <- names(df)[is_num]
}
- message("Using ", paste(met, collapse = ", "), " as measure column(s): use met_name to override.")
+ inform(paste0("Using ", paste(met, collapse = ", "), " as measure column(s): use `met_name` to override."))
met
}
#' @export
#' @rdname as.tbl_cube
-as.tbl_cube.data.frame <- function(x, dim_names = NULL, met_name = guess_met(x), ...) {
+as.tbl_cube.data.frame <- function(x, dim_names = NULL, met_name = guess_met(x),
+ ...) {
if (is.null(dim_names)) {
dim_names <- setdiff(names(x), met_name)
} else {
@@ -254,10 +265,6 @@ as.tbl_cube.data.frame <- function(x, dim_names = NULL, met_name = guess_met(x),
met_name <- names(x)[met_name]
}
- if (is.null(dim_names) && is.null(met_name)) {
- stop("At least one of dim_names and met_name must be non-NULL.", call. = FALSE)
- }
-
dims <- lapply(x[dim_names], unique)
n <- vapply(dims, length, integer(1))
@@ -267,8 +274,9 @@ as.tbl_cube.data.frame <- function(x, dim_names = NULL, met_name = guess_met(x),
dupe_row <- anyDuplicated(all[dim_names])
dupe <- unlist(all[dupe_row, dim_names])
- stop("Duplicate combination of dimension variables: ",
- paste(names(dupe), "=", dupe, collapse = ", "), call. = FALSE)
+ bad_args("x", "must be unique in all combinations of dimension variables, ",
+ 'duplicates: {fmt_named(dupe)}'
+ )
}
mets <- lapply(met_name, function(i) array(all[[i]], unname(n)))
@@ -281,32 +289,40 @@ as.tbl_cube.data.frame <- function(x, dim_names = NULL, met_name = guess_met(x),
# Verbs -------------------------------------------------------------------
#' @export
-select_.tbl_cube <- function(.data, ..., .dots) {
- dots <- lazyeval::all_dots(.dots, ...)
- vars <- select_vars_(names(.data$mets), dots)
-
+select.tbl_cube <- function(.data, ...) {
+ vars <- select_vars(names(.data$mets), ...)
.data$mets <- .data$mets[vars]
.data
}
-
#' @export
-rename_.tbl_cube <- function(.data, ..., .dots) {
- dots <- lazyeval::all_dots(.dots, ...)
- vars <- rename_vars_(names(.data$mets), dots)
+select_.tbl_cube <- function(.data, ..., .dots = list()) {
+ dots <- compat_lazy_dots(.dots, caller_env(), ...)
+ select(.data, !!! dots)
+}
+#' @export
+rename.tbl_cube <- function(.data, ...) {
+ vars <- rename_vars(names(.data$mets), !!! quos(...))
.data$mets <- .data$mets[vars]
.data
}
+#' @export
+rename_.tbl_cube <- function(.data, ..., .dots = list()) {
+ dots <- compat_lazy_dots(.dots, caller_env(), ...)
+ rename(.data, !!! dots)
+}
#' @export
-filter_.tbl_cube <- function(.data, ..., .dots) {
- dots <- lazyeval::all_dots(.dots, ...)
-
- idx <- vapply(dots, function(d) find_index_check(d$expr, names(.data$dims)),
- integer(1))
- for(i in seq_along(dots)) {
- sel <- eval(dots[[i]]$expr, .data$dims, dots[[i]]$env)
+filter.tbl_cube <- function(.data, ...) {
+ dots <- quos(...)
+
+ idx <- map2_int(
+ seq_along(dots), dots,
+ function(i, d) find_index_check(i, d, names(.data$dims))
+ )
+ for (i in seq_along(dots)) {
+ sel <- eval_tidy(dots[[i]], .data$dims)
sel <- sel & !is.na(sel)
.data$dims[[idx[i]]] <- .data$dims[[idx[i]]][sel]
@@ -315,11 +331,18 @@ filter_.tbl_cube <- function(.data, ..., .dots) {
.data
}
+#' @export
+filter_.tbl_cube <- function(.data, ..., .dots = list()) {
+ dots <- compat_lazy_dots(.dots, caller_env(), ...)
+ filter(.data, !!! dots)
+}
-find_index_check <- function(x, names) {
- idx <- find_index(x, names)
+find_index_check <- function(i, x, names) {
+ idx <- find_index(f_rhs(x), names)
if (length(idx) != 1) {
- stop(deparse(x), " does not refer to exactly one dimension.", call. = FALSE)
+ bad_calls(x, "must refer to exactly one dimension, ",
+ "not {fmt_obj(names[idx])}"
+ )
}
idx
}
@@ -338,20 +361,27 @@ find_index <- function(x, names) {
}
#' @export
-group_by_.tbl_cube <- function(.data, ..., .dots, add = FALSE) {
- groups <- group_by_prepare(.data, ..., .dots = .dots, add = add)
+group_by.tbl_cube <- function(.data, ..., add = FALSE) {
+ groups <- group_by_prepare(.data, ..., add = add)
# Convert symbols to indices
- nms <- names(groups$data$dims)
- nms_list <- as.list(setNames(seq_along(nms), nms))
-
- groups$data$groups <- unlist(lapply(groups$groups, eval, nms_list))
+ groups$data$groups <- match(groups$group_names, names(groups$data$dims))
groups$data
}
+#' @export
+group_by_.tbl_cube <- function(.data, ..., .dots = list(), add = FALSE) {
+ dots <- compat_lazy_dots(.dots, caller_env(), ...)
+ group_by(.data, !!! dots, add = add)
+}
#' @export
groups.tbl_cube <- function(x) {
- lapply(x$dims, as.name)[x$group]
+ lapply(group_vars(x), as.name)
+}
+
+#' @export
+group_vars.tbl_cube <- function(x) {
+ names(x$dims[x$groups])
}
# mutate and summarise operate similarly need to evaluate variables in special
@@ -359,34 +389,41 @@ groups.tbl_cube <- function(x) {
# for better performance
#' @export
-summarise_.tbl_cube <- function(.data, ..., .dots) {
- dots <- lazyeval::all_dots(.dots, ..., all_named = TRUE)
+summarise.tbl_cube <- function(.data, ...) {
+ dots <- named_quos(...)
out_dims <- .data$dims[.data$groups]
- n <- vapply(out_dims, length, integer(1))
+ n <- map_int(out_dims, length)
out_mets <- list()
for (nm in names(dots)) {
out_mets[[nm]] <- array(logical(), n)
}
- slices <- expand.grid(lapply(out_dims, seq_along), KEEP.OUT.ATTRS = FALSE)
+ slices <- expand.grid(map(out_dims, seq_along), KEEP.OUT.ATTRS = FALSE)
# Loop over each group
for (i in seq_len(nrow(slices))) {
- index <- as.list(slices[i, , drop = FALSE])
- mets <- lapply(.data$mets, subs_index, i = .data$groups, val = index,
- drop = TRUE)
+ index <- as_list(slices[i, , drop = FALSE])
+ mets <- map(
+ .data$mets, subs_index, i = .data$groups, val = index,
+ drop = TRUE
+ )
# Loop over each expression
for (j in seq_along(dots)) {
- res <- eval(dots[[j]]$expr, mets, dots[[j]]$env)
+ res <- eval_tidy(dots[[j]], mets)
out_mets[[j]][i] <- res
}
}
structure(list(dims = out_dims, mets = out_mets), class = "tbl_cube")
}
+#' @export
+summarise_.tbl_cube <- function(.data, ..., .dots = list()) {
+ dots <- compat_lazy_dots(.dots, caller_env(), ...)
+ summarise(.data, !!! dots)
+}
subs_index <- function(x, i, val, drop = FALSE) {
dims <- length(dim(x) %||% 1)
@@ -396,20 +433,23 @@ subs_index <- function(x, i, val, drop = FALSE) {
if (length(i) == 1 && is.atomic(val)) {
args[[i]] <- quote(val)
} else if (length(i) >= 1 && is.list(val)) {
- exprs <- lapply(seq_along(i), function(i) as.call(c(quote(`[[`), quote(val), i)))
+ exprs <- lapply(
+ seq_along(i),
+ function(i) as.call(c(quote(`[[`), quote(val), i))
+ )
args[i] <- exprs
} else {
- stop("Invalid input", call. = FALSE)
+ abort("Invalid input")
}
args$drop <- drop
call <- as.call(c(quote(`[`), quote(x), args))
- eval(call)
+ eval_bare(call)
}
#' @export
auto_copy.tbl_cube <- function(x, y, copy = FALSE, ...) {
- stop("Copying not supported by tbl_cube", call. = FALSE)
+ abort("Copying not supported by tbl_cube")
}
diff --git a/R/tbl-df.r b/R/tbl-df.r
index 5651d0d..717a7ce 100644
--- a/R/tbl-df.r
+++ b/R/tbl-df.r
@@ -1,61 +1,10 @@
#' Create a data frame tbl.
#'
-#' Forwards the argument to \code{\link[tibble]{as_data_frame}}, see
-#' \link{tibble-package} for more details.
+#' Deprecated: please use [tibble::as_tibble()] instead.
#'
#' @export
+#' @keywords internal
#' @param data a data frame
-#' @examples
-#' ds <- tbl_df(mtcars)
-#' ds
-#' as.data.frame(ds)
-#'
-#' if (require("Lahman") && packageVersion("Lahman") >= "3.0.1") {
-#' batting <- tbl_df(Batting)
-#' dim(batting)
-#' colnames(batting)
-#' head(batting)
-#'
-#' # Data manipulation verbs ---------------------------------------------------
-#' filter(batting, yearID > 2005, G > 130)
-#' select(batting, playerID:lgID)
-#' arrange(batting, playerID, desc(yearID))
-#' summarise(batting, G = mean(G), n = n())
-#' mutate(batting, rbi2 = if(is.null(AB)) 1.0 * R / AB else 0)
-#'
-#' # Group by operations -------------------------------------------------------
-#' # To perform operations by group, create a grouped object with group_by
-#' players <- group_by(batting, playerID)
-#' head(group_size(players), 100)
-#'
-#' summarise(players, mean_g = mean(G), best_ab = max(AB))
-#' best_year <- filter(players, AB == max(AB) | G == max(G))
-#' progress <- mutate(players, cyear = yearID - min(yearID) + 1,
-#' rank(desc(AB)), cumsum(AB))
-#'
-#' # When you group by multiple level, each summarise peels off one level
-#' \donttest{
-#' per_year <- group_by(batting, playerID, yearID)
-#' stints <- summarise(per_year, stints = max(stint))
-#' filter(stints, stints > 3)
-#' summarise(stints, max(stints))
-#' mutate(stints, cumsum(stints))
-#' }
-#'
-#' # Joins ---------------------------------------------------------------------
-#' player_info <- select(tbl_df(Master), playerID, birthYear)
-#' hof <- select(filter(tbl_df(HallOfFame), inducted == "Y"),
-#' playerID, votedBy, category)
-#'
-#' # Match players and their hall of fame data
-#' inner_join(player_info, hof)
-#' # Keep all players, match hof data where available
-#' left_join(player_info, hof)
-#' # Find only players in hof
-#' semi_join(player_info, hof)
-#' # Find players not in hof
-#' anti_join(player_info, hof)
-#' }
tbl_df <- function(data) {
as_data_frame(data)
}
@@ -92,50 +41,84 @@ as.data.frame.tbl_df <- function(x, row.names = NULL, optional = FALSE, ...) {
# Verbs ------------------------------------------------------------------------
#' @export
-arrange_.tbl_df <- function(.data, ..., .dots) {
- dots <- lazyeval::all_dots(.dots, ..., all_named = TRUE)
+arrange.tbl_df <- function(.data, ...) {
+ dots <- quos(...)
+ arrange_impl(.data, dots)
+}
+#' @export
+arrange_.tbl_df <- function(.data, ..., .dots = list()) {
+ dots <- compat_lazy_dots(.dots, caller_env(), ...)
arrange_impl(.data, dots)
}
#' @export
-filter_.tbl_df <- function(.data, ..., .dots) {
- dots <- lazyeval::all_dots(.dots, ...)
- if (any(has_names(dots))) {
- stop("filter() takes unnamed arguments. Do you need `==`?", call. = FALSE)
+filter.tbl_df <- function(.data, ...) {
+ dots <- quos(...)
+ if (any(have_name(dots))) {
+ bad <- dots[have_name(dots)]
+ bad_eq_ops(bad, "must not be named, do you need `==`?")
+ } else if (is_empty(dots)) {
+ return(.data)
}
- # C++ code assumes that elements are named, so give them automatic names
- dots <- lazyeval::auto_name(dots)
- filter_impl(.data, dots)
+ quo <- all_exprs(!!! dots, .vectorised = TRUE)
+ filter_impl(.data, quo)
+}
+#' @export
+filter_.tbl_df <- function(.data, ..., .dots = list()) {
+ dots <- compat_lazy_dots(.dots, caller_env(), ...)
+ filter(.data, !!! dots)
}
#' @export
-slice_.tbl_df <- function(.data, ..., .dots) {
- dots <- lazyeval::all_dots(.dots, ..., all_named = TRUE)
+slice.tbl_df <- function(.data, ...) {
+ dots <- named_quos(...)
+ slice_impl(.data, dots)
+}
+#' @export
+slice_.tbl_df <- function(.data, ..., .dots = list()) {
+ dots <- compat_lazy_dots(.dots, caller_env(), ..., .named = TRUE)
slice_impl(.data, dots)
}
#' @export
-mutate_.tbl_df <- function(.data, ..., .dots) {
- dots <- lazyeval::all_dots(.dots, ..., all_named = TRUE)
+mutate.tbl_df <- function(.data, ...) {
+ dots <- named_quos(...)
+ mutate_impl(.data, dots)
+}
+#' @export
+mutate_.tbl_df <- function(.data, ..., .dots = list()) {
+ dots <- compat_lazy_dots(.dots, caller_env(), ..., .named = TRUE)
mutate_impl(.data, dots)
}
#' @export
-summarise_.tbl_df <- function(.data, ..., .dots) {
- dots <- lazyeval::all_dots(.dots, ..., all_named = TRUE)
+summarise.tbl_df <- function(.data, ...) {
+ dots <- named_quos(...)
+ summarise_impl(.data, dots)
+}
+#' @export
+summarise_.tbl_df <- function(.data, ..., .dots = list()) {
+ dots <- compat_lazy_dots(.dots, caller_env(), ..., .named = TRUE)
summarise_impl(.data, dots)
}
# Joins ------------------------------------------------------------------------
-#' Join data frame tbls.
+#' Join data frame tbls
#'
-#' See \code{\link{join}} for a description of the general purpose of the
+#' See [join] for a description of the general purpose of the
#' functions.
#'
#' @inheritParams inner_join
#' @param ... included for compatibility with the generic; otherwise ignored.
+#' @param na_matches
+#' Use `"never"` to always treat two `NA` or `NaN` values as
+#' different, like joins for database sources, similarly to
+#' `merge(incomparables = FALSE)`.
+#' The default,`"na"`, always treats two `NA` or `NaN` values as equal, like [merge()].
+#' Users and package authors can change the default behavior by calling
+#' `pkgconfig::set_config("dplyr::na_matches" = "never")`.
#' @examples
#' if (require("Lahman")) {
#' batting_df <- tbl_df(Batting)
@@ -158,99 +141,88 @@ summarise_.tbl_df <- function(.data, ..., .dots) {
#' @name join.tbl_df
NULL
+check_na_matches <- function(na_matches) {
+ na_matches <- match.arg(na_matches, choices = c("na", "never"))
+ accept_na_match <- (na_matches == "na")
+ accept_na_match
+}
+
#' @export
#' @rdname join.tbl_df
inner_join.tbl_df <- function(x, y, by = NULL, copy = FALSE,
- suffix = c(".x", ".y"), ...) {
+ suffix = c(".x", ".y"), ...,
+ na_matches = pkgconfig::get_config("dplyr::na_matches")) {
by <- common_by(by, x, y)
suffix <- check_suffix(suffix)
y <- auto_copy(x, y, copy = copy)
- inner_join_impl(x, y, by$x, by$y, suffix$x, suffix$y)
+ inner_join_impl(x, y, by$x, by$y, suffix$x, suffix$y, check_na_matches(na_matches))
}
#' @export
#' @rdname join.tbl_df
left_join.tbl_df <- function(x, y, by = NULL, copy = FALSE,
- suffix = c(".x", ".y"), ...) {
+ suffix = c(".x", ".y"), ...,
+ na_matches = pkgconfig::get_config("dplyr::na_matches")) {
by <- common_by(by, x, y)
suffix <- check_suffix(suffix)
y <- auto_copy(x, y, copy = copy)
- left_join_impl(x, y, by$x, by$y, suffix$x, suffix$y)
+ left_join_impl(x, y, by$x, by$y, suffix$x, suffix$y, check_na_matches(na_matches))
}
#' @export
#' @rdname join.tbl_df
right_join.tbl_df <- function(x, y, by = NULL, copy = FALSE,
- suffix = c(".x", ".y"), ...) {
+ suffix = c(".x", ".y"), ...,
+ na_matches = pkgconfig::get_config("dplyr::na_matches")) {
by <- common_by(by, x, y)
suffix <- check_suffix(suffix)
y <- auto_copy(x, y, copy = copy)
- right_join_impl(x, y, by$x, by$y, suffix$x, suffix$y)
+ right_join_impl(x, y, by$x, by$y, suffix$x, suffix$y, check_na_matches(na_matches))
}
#' @export
#' @rdname join.tbl_df
full_join.tbl_df <- function(x, y, by = NULL, copy = FALSE,
- suffix = c(".x", ".y"), ...) {
+ suffix = c(".x", ".y"), ...,
+ na_matches = pkgconfig::get_config("dplyr::na_matches")) {
by <- common_by(by, x, y)
suffix <- check_suffix(suffix)
y <- auto_copy(x, y, copy = copy)
- full_join_impl(x, y, by$x, by$y, suffix$x, suffix$y)
+ full_join_impl(x, y, by$x, by$y, suffix$x, suffix$y, check_na_matches(na_matches))
}
#' @export
#' @rdname join.tbl_df
-semi_join.tbl_df <- function(x, y, by = NULL, copy = FALSE, ...) {
+semi_join.tbl_df <- function(x, y, by = NULL, copy = FALSE, ...,
+ na_matches = pkgconfig::get_config("dplyr::na_matches")) {
by <- common_by(by, x, y)
y <- auto_copy(x, y, copy = copy)
- semi_join_impl(x, y, by$x, by$y)
+ semi_join_impl(x, y, by$x, by$y, check_na_matches(na_matches))
}
#' @export
#' @rdname join.tbl_df
-anti_join.tbl_df <- function(x, y, by = NULL, copy = FALSE, ...) {
+anti_join.tbl_df <- function(x, y, by = NULL, copy = FALSE, ...,
+ na_matches = pkgconfig::get_config("dplyr::na_matches")) {
by <- common_by(by, x, y)
y <- auto_copy(x, y, copy = copy)
- anti_join_impl(x, y, by$x, by$y)
+ anti_join_impl(x, y, by$x, by$y, check_na_matches(na_matches))
}
# Set operations ---------------------------------------------------------------
#' @export
-distinct_.tbl_df <- function(.data, ..., .dots) {
+distinct.tbl_df <- function(.data, ...) {
tbl_df(NextMethod())
}
-
-
-# Other methods that currently don't have a better home -----------------------
-
-order_ <- function(..., data){
- parent_frame <- parent.frame()
- if(missing(data)) {
- env <- parent_frame
- } else {
- env <- as.environment(data)
- parent.env(env) <- parent_frame
- }
- order_impl(dots(...) , env)
-}
-
-equal_ <- function(x, y){
- equal_data_frame(x, y)
-}
-
-all_equal_ <- function(...){
- env <- parent.frame()
- all_equal_data_frame(dots(...), env)
-}
-
-sort_ <- function(data){
- sort_impl(data)
+#' @export
+distinct_.tbl_df <- function(.data, ..., .dots = list()) {
+ tbl_df(NextMethod())
}
diff --git a/R/tbl-lazy.R b/R/tbl-lazy.R
deleted file mode 100644
index a443a42..0000000
--- a/R/tbl-lazy.R
+++ /dev/null
@@ -1,130 +0,0 @@
-tbl_lazy <- function(df) {
- make_tbl("lazy", ops = op_base_local(df, env = parent.frame()))
-}
-
-lazy_frame <- function(...) {
- tbl_lazy(data_frame(...))
-}
-
-#' @export
-same_src.tbl_lazy <- function(x, y) {
- inherits(y, "tbl_lazy")
-}
-
-#' @export
-tbl_vars.tbl_lazy <- function(x) {
- op_vars(x$ops)
-}
-
-#' @export
-groups.tbl_lazy <- function(x) {
- lapply(op_grps(x$ops), as.name)
-}
-
-#' @export
-print.tbl_lazy <- function(x, ...) {
- cat("Source: lazy\n")
- cat("Vars : ", commas(op_vars(x$ops)), "\n", sep = "")
- cat("Groups: ", commas(op_grps(x$ops)), "\n", sep = "")
- cat("\n")
-
- print(x$ops)
-}
-
-# Single table methods ----------------------------------------------------
-
-#' @export
-filter_.tbl_lazy <- function(.data, ..., .dots) {
- dots <- lazyeval::all_dots(.dots, ...)
- add_op_single("filter", .data, dots = dots)
-}
-
-#' @export
-arrange_.tbl_lazy <- function(.data, ..., .dots) {
- dots <- lazyeval::all_dots(.dots, ...)
- add_op_single("arrange", .data, dots = dots)
-}
-
-#' @export
-select_.tbl_lazy <- function(.data, ..., .dots) {
- dots <- lazyeval::all_dots(.dots, ...)
- add_op_single("select", .data, dots = dots)
-}
-
-#' @export
-rename_.tbl_lazy <- function(.data, ..., .dots) {
- dots <- lazyeval::all_dots(.dots, ...)
- add_op_single("rename", .data, dots = dots)
-}
-
-#' @export
-summarise_.tbl_lazy <- function(.data, ..., .dots) {
- dots <- lazyeval::all_dots(.dots, ...)
- add_op_single("summarise", .data, dots = dots)
-}
-
-#' @export
-mutate_.tbl_lazy <- function(.data, ..., .dots) {
- dots <- lazyeval::all_dots(.dots, ..., all_named = TRUE)
- add_op_single("mutate", .data, dots = dots)
-}
-
-#' @export
-group_by_.tbl_lazy <- function(.data, ..., .dots, add = TRUE) {
- dots <- lazyeval::all_dots(.dots, ..., all_named = TRUE)
- add_op_single("group_by", .data, dots = dots, args = list(add = add))
-}
-
-#' @export
-head.tbl_lazy <- function(x, n = 6L, ...) {
- add_op_single("head", x, args = list(n = n))
-}
-
-#' @export
-ungroup.tbl_lazy <- function(x, ...) {
- add_op_single("ungroup", x)
-}
-
-#' @export
-distinct_.tbl_lazy <- function(.data, ..., .dots, .keep_all = FALSE) {
- dots <- lazyeval::all_dots(.dots, ..., all_named = TRUE)
- add_op_single("distinct", .data, dots = dots, args = list(.keep_all = .keep_all))
-}
-
-
-# Dual table verbs ------------------------------------------------------------
-
-add_op_join <- function(x, y, type, by = NULL, copy = FALSE,
- suffix = c(".x", ".y"),
- auto_index = FALSE, ...) {
- by <- common_by(by, x, y)
- y <- auto_copy(x, y, copy, indexes = if (auto_index) list(by$y))
-
- x$ops <- op_double("join", x, y, args = list(
- type = type,
- by = by,
- suffix = suffix
- ))
- x
-}
-
-add_op_semi_join <- function(x, y, anti = FALSE, by = NULL, copy = FALSE,
- auto_index = FALSE, ...) {
- by <- common_by(by, x, y)
- y <- auto_copy(x, y, copy, indexes = if (auto_index) list(by$y))
-
- x$ops <- op_double("semi_join", x, y, args = list(
- anti = anti,
- by = by
- ))
- x
-}
-
-add_op_set_op <- function(x, y, type, copy = FALSE, ...) {
- y <- auto_copy(x, y, copy)
- x$ops <- op_double("set_op", x, y, args = list(type = type))
- x
-}
-
-# Currently the dual table verbs are defined on tbl_sql, because the
-# because they definitions are bit too tightly connected to SQL.
diff --git a/R/tbl-sql.r b/R/tbl-sql.r
deleted file mode 100644
index 7501e2f..0000000
--- a/R/tbl-sql.r
+++ /dev/null
@@ -1,481 +0,0 @@
-#' Create an SQL tbl (abstract)
-#'
-#' This method shouldn't be called by users - it should only be used by
-#' backend implementors who are creating backends that extend the basic
-#' sql behaviour.
-#'
-#' @keywords internal
-#' @export
-#' @param subclass name of subclass
-#' @param ... needed for agreement with generic. Not otherwise used.
-#' @param vars If known, the names of the variables in the tbl. This is
-#' relatively expensive to determine automatically, so is cached throughout
-#' dplyr. However, you should usually be able to leave this blank and it
-#' will be determined from the context.
-tbl_sql <- function(subclass, src, from, ..., vars = attr(from, "vars")) {
- make_tbl(
- c(subclass, "sql", "lazy"),
- src = src,
- ops = op_base_remote(src, from, vars)
- )
-}
-
-#' @export
-same_src.tbl_sql <- function(x, y) {
- if (!inherits(y, "tbl_sql")) return(FALSE)
- same_src(x$src, y$src)
-}
-
-# Grouping methods -------------------------------------------------------------
-
-#' @export
-group_size.tbl_sql <- function(x) {
- df <- x %>%
- summarise(n = n()) %>%
- collect()
- df$n
-}
-
-#' @export
-n_groups.tbl_sql <- function(x) {
- if (length(groups(x)) == 0) return(1L)
-
- df <- x %>%
- summarise(x) %>%
- ungroup() %>%
- summarise(n = n()) %>%
- collect()
- df$n
-}
-
-# Standard data frame methods --------------------------------------------------
-
-#' @export
-as.data.frame.tbl_sql <- function(x, row.names = NULL, optional = NULL,
- ..., n = Inf) {
- as.data.frame(collect(x, n = n))
-}
-
-#' @export
-print.tbl_sql <- function(x, ..., n = NULL, width = NULL) {
- cat("Source: query ", dim_desc(x), "\n", sep = "")
- cat("Database: ", src_desc(x$src), "\n", sep = "")
-
- grps <- op_grps(x$ops)
- if (length(grps) > 0) {
- cat("Groups: ", commas(op_grps(x$ops)), "\n", sep = "")
- }
-
- cat("\n")
-
- print(trunc_mat(x, n = n, width = width))
- invisible(x)
-}
-
-#' @export
-dimnames.tbl_sql <- function(x) {
- list(NULL, op_vars(x$ops))
-}
-
-#' @export
-dim.tbl_sql <- function(x) {
- c(NA, length(op_vars(x$ops)))
-}
-
-#' @export
-tail.tbl_sql <- function(x, n = 6L, ...) {
- stop("tail() is not supported by sql sources", call. = FALSE)
-}
-
-# Joins ------------------------------------------------------------------------
-
-#' Join sql tbls.
-#'
-#' See \code{\link{join}} for a description of the general purpose of the
-#' functions.
-#'
-#' @section Implementation notes:
-#'
-#' Semi-joins are implemented using \code{WHERE EXISTS}, and anti-joins with
-#' \code{WHERE NOT EXISTS}. Support for semi-joins is somewhat partial: you
-#' can only create semi joins where the \code{x} and \code{y} columns are
-#' compared with \code{=} not with more general operators.
-#'
-#' @inheritParams join
-#' @param copy If \code{x} and \code{y} are not from the same data source,
-#' and \code{copy} is \code{TRUE}, then \code{y} will be copied into a
-#' temporary table in same database as \code{x}. \code{join} will automatically
-#' run \code{ANALYZE} on the created table in the hope that this will make
-#' you queries as efficient as possible by giving more data to the query
-#' planner.
-#'
-#' This allows you to join tables across srcs, but it's potentially expensive
-#' operation so you must opt into it.
-#' @param auto_index if \code{copy} is \code{TRUE}, automatically create
-#' indices for the variables in \code{by}. This may speed up the join if
-#' there are matching indexes in \code{x}.
-#' @examples
-#' \dontrun{
-#' if (require("RSQLite") && has_lahman("sqlite")) {
-#'
-#' # Left joins ----------------------------------------------------------------
-#' lahman_s <- lahman_sqlite()
-#' batting <- tbl(lahman_s, "Batting")
-#' team_info <- select(tbl(lahman_s, "Teams"), yearID, lgID, teamID, G, R:H)
-#'
-#' # Combine player and whole team statistics
-#' first_stint <- select(filter(batting, stint == 1), playerID:H)
-#' both <- left_join(first_stint, team_info, type = "inner", by = c("yearID", "teamID", "lgID"))
-#' head(both)
-#' explain(both)
-#'
-#' # Join with a local data frame
-#' grid <- expand.grid(
-#' teamID = c("WAS", "ATL", "PHI", "NYA"),
-#' yearID = 2010:2012)
-#' top4a <- left_join(batting, grid, copy = TRUE)
-#' explain(top4a)
-#'
-#' # Indices don't really help here because there's no matching index on
-#' # batting
-#' top4b <- left_join(batting, grid, copy = TRUE, auto_index = TRUE)
-#' explain(top4b)
-#'
-#' # Semi-joins ----------------------------------------------------------------
-#'
-#' people <- tbl(lahman_s, "Master")
-#'
-#' # All people in half of fame
-#' hof <- tbl(lahman_s, "HallOfFame")
-#' semi_join(people, hof)
-#'
-#' # All people not in the hall of fame
-#' anti_join(people, hof)
-#'
-#' # Find all managers
-#' manager <- tbl(lahman_s, "Managers")
-#' semi_join(people, manager)
-#'
-#' # Find all managers in hall of fame
-#' famous_manager <- semi_join(semi_join(people, manager), hof)
-#' famous_manager
-#' explain(famous_manager)
-#'
-#' # Anti-joins ----------------------------------------------------------------
-#'
-#' # batters without person covariates
-#' anti_join(batting, people)
-#' }
-#' }
-#' @name join.tbl_sql
-NULL
-
-#' @rdname join.tbl_sql
-#' @export
-inner_join.tbl_lazy <- function(x, y, by = NULL, copy = FALSE,
- suffix = c(".x", ".y"),
- auto_index = FALSE, ...) {
- add_op_join(
- x, y,
- "inner",
- by = by,
- copy = copy,
- suffix = suffix,
- auto_index = auto_index,
- ...
- )
-}
-
-#' @rdname join.tbl_sql
-#' @export
-left_join.tbl_lazy <- function(x, y, by = NULL, copy = FALSE,
- suffix = c(".x", ".y"),
- auto_index = FALSE, ...) {
- add_op_join(
- x, y,
- "left",
- by = by,
- copy = copy,
- suffix = suffix,
- auto_index = auto_index,
- ...
- )
-}
-
-#' @rdname join.tbl_sql
-#' @export
-right_join.tbl_lazy <- function(x, y, by = NULL, copy = FALSE,
- suffix = c(".x", ".y"),
- auto_index = FALSE, ...) {
- add_op_join(
- x, y,
- "right",
- by = by,
- copy = copy,
- suffix = suffix,
- auto_index = auto_index,
- ...
- )
-}
-
-#' @rdname join.tbl_sql
-#' @export
-full_join.tbl_lazy <- function(x, y, by = NULL, copy = FALSE,
- suffix = c(".x", ".y"),
- auto_index = FALSE, ...) {
- add_op_join(
- x, y,
- "full",
- by = by,
- copy = copy,
- suffix = suffix,
- auto_index = auto_index,
- ...
- )
-}
-
-#' @rdname join.tbl_sql
-#' @export
-semi_join.tbl_lazy <- function(x, y, by = NULL, copy = FALSE,
- auto_index = FALSE, ...) {
- add_op_semi_join(
- x, y,
- anti = FALSE,
- by = by,
- copy = copy,
- auto_index = auto_index,
- ...
- )
-}
-
-#' @rdname join.tbl_sql
-#' @export
-anti_join.tbl_lazy <- function(x, y, by = NULL, copy = FALSE,
- auto_index = FALSE, ...) {
- add_op_semi_join(
- x, y,
- anti = TRUE,
- by = by,
- copy = copy,
- auto_index = auto_index,
- ...
- )
-}
-
-# Set operations ---------------------------------------------------------------
-
-#' @export
-intersect.tbl_lazy <- function(x, y, copy = FALSE, ...) {
- add_op_set_op(x, y, "INTERSECT", copy = copy, ...)
-}
-#' @export
-union.tbl_lazy <- function(x, y, copy = FALSE, ...) {
- add_op_set_op(x, y, "UNION", copy = copy, ...)
-}
-#' @export
-union_all.tbl_lazy <- function(x, y, copy = FALSE, ...) {
- add_op_set_op(x, y, "UNION ALL", copy = copy, ...)
-}
-#' @export
-setdiff.tbl_lazy <- function(x, y, copy = FALSE, ...) {
- add_op_set_op(x, y, "EXCEPT", copy = copy, ...)
-}
-
-# Copying ----------------------------------------------------------------------
-
-#' @export
-auto_copy.tbl_sql <- function(x, y, copy = FALSE, ...) {
- copy_to(x$src, as.data.frame(y), random_table_name(), ...)
-}
-
-#' Copy a local data frame to a sqlite src.
-#'
-#' This standard method works for all sql sources.
-#'
-#' @export
-#' @param types a character vector giving variable types to use for the columns.
-#' See \url{http://www.sqlite.org/datatype3.html} for available types.
-#' @param temporary if \code{TRUE}, will create a temporary table that is
-#' local to this connection and will be automatically deleted when the
-#' connection expires
-#' @param unique_indexes a list of character vectors. Each element of the list
-#' will create a new unique index over the specified column(s). Duplicate rows
-#' will result in failure.
-#' @param indexes a list of character vectors. Each element of the list
-#' will create a new index.
-#' @param analyze if \code{TRUE} (the default), will automatically ANALYZE the
-#' new table so that the query optimiser has useful information.
-#' @inheritParams copy_to
-#' @return a sqlite \code{\link{tbl}} object
-#' @examples
-#' if (requireNamespace("RSQLite")) {
-#' db <- src_sqlite(tempfile(), create = TRUE)
-#'
-#' iris2 <- copy_to(db, iris)
-#' mtcars$model <- rownames(mtcars)
-#' mtcars2 <- copy_to(db, mtcars, indexes = list("model"))
-#'
-#' explain(filter(mtcars2, model == "Hornet 4 Drive"))
-#'
-#' # Note that tables are temporary by default, so they're not
-#' # visible from other connections to the same database.
-#' src_tbls(db)
-#' db2 <- src_sqlite(db$path)
-#' src_tbls(db2)
-#' }
-copy_to.src_sql <- function(dest, df, name = deparse(substitute(df)),
- types = NULL, temporary = TRUE,
- unique_indexes = NULL, indexes = NULL,
- analyze = TRUE, ...) {
- assert_that(is.data.frame(df), is.string(name), is.flag(temporary))
- class(df) <- "data.frame" # avoid S4 dispatch problem in dbSendPreparedQuery
-
- if (isTRUE(db_has_table(dest$con, name))) {
- stop("Table ", name, " already exists.", call. = FALSE)
- }
-
- types <- types %||% db_data_type(dest$con, df)
- names(types) <- names(df)
-
- con <- dest$con
- db_begin(con)
- on.exit(db_rollback(con))
-
- db_create_table(con, name, types, temporary = temporary)
- db_insert_into(con, name, df)
- db_create_indexes(con, name, unique_indexes, unique = TRUE)
- db_create_indexes(con, name, indexes, unique = FALSE)
- if (analyze) db_analyze(con, name)
-
- db_commit(con)
- on.exit(NULL)
-
- tbl(dest, name)
-}
-
-#' @export
-collapse.tbl_sql <- function(x, vars = NULL, ...) {
- sql <- sql_render(x)
- tbl(x$src, sql) %>% group_by_(.dots = groups(x))
-}
-
-#' @export
-#' @rdname compute
-compute.tbl_sql <- function(x, name = random_table_name(), temporary = TRUE,
- unique_indexes = list(), indexes = list(),
- ...) {
- if (!is.list(indexes)) {
- indexes <- as.list(indexes)
- }
- if (!is.list(unique_indexes)) {
- unique_indexes <- as.list(unique_indexes)
- }
-
- vars <- op_vars(x)
- assert_that(all(unlist(indexes) %in% vars))
- assert_that(all(unlist(unique_indexes) %in% vars))
- x_aliased <- select_(x, .dots = vars) # avoids problems with SQLite quoting (#1754)
- db_save_query(x$src$con, sql_render(x_aliased), name = name, temporary = temporary)
- db_create_indexes(x$src$con, name, unique_indexes, unique = TRUE)
- db_create_indexes(x$src$con, name, indexes, unique = FALSE)
-
- tbl(x$src, name) %>% group_by_(.dots = groups(x))
-}
-
-#' @export
-collect.tbl_sql <- function(x, ..., n = 1e5, warn_incomplete = TRUE) {
- assert_that(length(n) == 1, n > 0L)
- if (n == Inf) {
- n <- -1
- }
-
- sql <- sql_render(x)
- res <- dbSendQuery(x$src$con, sql)
- on.exit(dbClearResult(res))
-
- out <- dbFetch(res, n)
- if (warn_incomplete) {
- res_warn_incomplete(res, "n = Inf")
- }
-
- grouped_df(out, groups(x))
-}
-
-# Do ---------------------------------------------------------------------------
-
-#' @export
-#' @rdname do
-#' @param .chunk_size The size of each chunk to pull into R. If this number is
-#' too big, the process will be slow because R has to allocate and free a lot
-#' of memory. If it's too small, it will be slow, because of the overhead of
-#' talking to the database.
-do_.tbl_sql <- function(.data, ..., .dots, .chunk_size = 1e4L) {
- group_by <- groups(.data)
- if (is.null(group_by)) stop("No grouping", call. = FALSE)
-
- args <- lazyeval::all_dots(.dots, ...)
- named <- named_args(args)
-
- # Create data frame of labels
- labels <- .data %>%
- select_(.dots = group_by) %>%
- summarise() %>%
- collect()
-
- n <- nrow(labels)
- m <- length(args)
-
- out <- replicate(m, vector("list", n), simplify = FALSE)
- names(out) <- names(args)
- p <- progress_estimated(n * m, min_time = 2)
- env <- new.env(parent = lazyeval::common_env(args))
-
- # Create ungrouped data frame suitable for chunked retrieval
- query <- query(.data$src$con, sql_render(ungroup(.data)), op_vars(.data))
-
- # When retrieving in pages, there's no guarantee we'll get a complete group.
- # So we always assume the last group in the chunk is incomplete, and leave
- # it for the next. If the group size is large than chunk size, it may
- # take a couple of iterations to get the entire group, but that should
- # be an unusual situation.
- last_group <- NULL
- i <- 0
- gvars <- seq_along(group_by)
-
- query$fetch_paged(.chunk_size, function(chunk) {
- if (!is.null(last_group)) {
- chunk <- rbind(last_group, chunk)
- }
-
- # Create an id for each group
- grouped <- chunk %>% group_by_(.dots = names(chunk)[gvars])
- index <- attr(grouped, "indices") # zero indexed
-
- last_group <<- chunk[index[[length(index)]] + 1L, , drop = FALSE]
-
- for (j in seq_len(n - 1)) {
- env$. <- chunk[index[[j]] + 1L, , drop = FALSE]
- for (k in seq_len(m)) {
- out[[k]][i + j] <<- list(eval(args[[k]]$expr, envir = env))
- p$tick()$print()
- }
- }
- i <<- i + (n - 1)
- })
-
- # Process last group
- if (!is.null(last_group)) {
- env$. <- last_group
- for (k in seq_len(m)) {
- out[[k]][i + 1] <- list(eval(args[[k]]$expr, envir = env))
- p$tick()$print()
- }
- }
-
- if (!named) {
- label_output_dataframe(labels, out, groups(.data))
- } else {
- label_output_list(labels, out, groups(.data))
- }
-}
-
diff --git a/R/tbl.r b/R/tbl.r
index 097e793..521d1c7 100644
--- a/R/tbl.r
+++ b/R/tbl.r
@@ -11,16 +11,16 @@ tbl <- function(src, ...) {
#' Create a "tbl" object
#'
-#' \code{tbl} is the standard constructor for tbls. \code{as.tbl} coerces,
-#' and \code{is.tbl} tests.
+#' `tbl()` is the standard constructor for tbls. `as.tbl()` coerces,
+#' and `is.tbl()` tests.
#'
#' @keywords internal
#' @export
#' @param subclass name of subclass. "tbl" is an abstract base class, so you
-#' must supply this value. \code{tbl_} is automatically prepended to the
+#' must supply this value. `tbl_` is automatically prepended to the
#' class name
#' @param object to test/coerce.
-#' @param ... For \code{tbl}, other fields used by class. For \code{as.tbl},
+#' @param ... For `tbl()`, other fields used by class. For `as.tbl()`,
#' other arguments passed to methods.
#' @examples
#' as.tbl(mtcars)
@@ -35,7 +35,7 @@ is.tbl <- function(x) inherits(x, "tbl")
#' @export
#' @rdname tbl
-#' @param x an object to coerce to a \code{tbl}
+#' @param x an object to coerce to a `tbl`
as.tbl <- function(x, ...) UseMethod("as.tbl")
#' @export
@@ -43,6 +43,19 @@ as.tbl.tbl <- function(x, ...) x
#' List variables provided by a tbl.
#'
+#' `tbl_vars()` returns all variables while `tbl_nongroup_vars()`
+#' returns only non-grouping variables.
+#'
#' @export
#' @param x A tbl object
-tbl_vars <- function(x) UseMethod("tbl_vars")
+#' @seealso [group_vars()] for a function that returns grouping
+#' variables.
+#' @keywords internal
+tbl_vars <- function(x) {
+ UseMethod("tbl_vars")
+}
+#' @rdname tbl_vars
+#' @export
+tbl_nongroup_vars <- function(x) {
+ setdiff(tbl_vars(x), group_vars(x))
+}
diff --git a/R/top-n.R b/R/top-n.R
index d63c04a..15fbd42 100644
--- a/R/top-n.R
+++ b/R/top-n.R
@@ -1,18 +1,23 @@
-#' Select top (or bottom) n rows (by value).
+#' Select top (or bottom) n rows (by value)
#'
-#' This is a convenient wrapper that uses \code{\link{filter}} and
-#' \code{\link{min_rank}} to select the top or bottom entries in each group,
-#' ordered by \code{wt}.
+#' This is a convenient wrapper that uses [filter()] and
+#' [min_rank()] to select the top or bottom entries in each group,
+#' ordered by `wt`.
#'
-#' @param x a \code{\link{tbl}} to filter
-#' @param n number of rows to return. If \code{x} is grouped, this is the
-#' number of rows per group. Will include more than \code{n} rows if
+#' @param x a [tbl()] to filter
+#' @param n number of rows to return. If `x` is grouped, this is the
+#' number of rows per group. Will include more than `n` rows if
#' there are ties.
#'
-#' If \code{n} is positive, selects the top \code{n} rows. If negative,
-#' selects the bottom \code{n} rows.
-#' @param wt (Optional). The variable to use for ordering. If not specified,
-#' defaults to the last variable in the tbl.
+#' If `n` is positive, selects the top `n` rows. If negative,
+#' selects the bottom `n` rows.
+#' @param wt (Optional). The variable to use for ordering. If not
+#' specified, defaults to the last variable in the tbl.
+#'
+#' This argument is automatically [quoted][rlang::quo] and later
+#' [evaluated][rlang::eval_tidy] in the context of the data
+#' frame. It supports [unquoting][rlang::quasiquotation]. See
+#' `vignette("programming")` for an introduction to these concepts.
#' @export
#' @examples
#' df <- data.frame(x = c(10, 4, 1, 6, 3, 1, 1))
@@ -35,22 +40,24 @@
#' tbl_df(Batting) %>% group_by(playerID) %>% top_n(1, G)
#' }
top_n <- function(x, n, wt) {
- if (missing(wt)) {
+ wt <- enquo(wt)
+
+ if (quo_is_missing(wt)) {
vars <- tbl_vars(x)
- message("Selecting by ", vars[length(vars)])
- wt <- as.name(vars[length(vars)])
- } else {
- wt <- substitute(wt)
+ wt_name <- vars[length(vars)]
+ inform(glue("Selecting by ", wt_name))
+ wt <- sym(wt_name)
+ }
+
+ if (!is_scalar_integerish(n)) {
+ abort("`n` must be a scalar integer")
}
- stopifnot(is.numeric(n), length(n) == 1)
if (n > 0) {
- call <- substitute(filter(x, min_rank(desc(wt)) <= n),
- list(n = n, wt = wt))
+ quo <- quo(filter(x, min_rank(desc(!! wt)) <= !! n))
} else {
- call <- substitute(filter(x, min_rank(wt) <= n),
- list(n = abs(n), wt = wt))
+ quo <- quo(filter(x, min_rank(!! wt) <= !! abs(n)))
}
- eval(call)
+ eval_tidy(quo)
}
diff --git a/R/translate-sql-base.r b/R/translate-sql-base.r
deleted file mode 100644
index ccfb0a3..0000000
--- a/R/translate-sql-base.r
+++ /dev/null
@@ -1,246 +0,0 @@
-#' @include translate-sql-helpers.r
-#' @include sql-escape.r
-NULL
-
-
-sql_if <- function(cond, if_true, if_false = NULL) {
- build_sql(
- "CASE WHEN (", cond, ")",
- " THEN (", if_true, ")",
- if (!is.null(if_false)) build_sql(" ELSE (", if_false, ")"),
- " END"
- )
-}
-
-#' @export
-#' @rdname sql_variant
-#' @format NULL
-base_scalar <- sql_translator(
- `+` = sql_infix("+"),
- `*` = sql_infix("*"),
- `/` = sql_infix("/"),
- `%%` = sql_infix("%"),
- `^` = sql_prefix("power", 2),
- `-` = function(x, y = NULL) {
- if (is.null(y)) {
- if (is.numeric(x)) {
- -x
- } else {
- build_sql(sql("-"), x)
- }
- } else {
- build_sql(x, sql(" - "), y)
- }
- },
-
- `!=` = sql_infix("!="),
- `==` = sql_infix("="),
- `<` = sql_infix("<"),
- `<=` = sql_infix("<="),
- `>` = sql_infix(">"),
- `>=` = sql_infix(">="),
-
- `!` = sql_prefix("not"),
- `&` = sql_infix("and"),
- `&&` = sql_infix("and"),
- `|` = sql_infix("or"),
- `||` = sql_infix("or"),
- xor = function(x, y) {
- sql(sprintf("%1$s OR %2$s AND NOT (%1$s AND %2$s)", escape(x), escape(y)))
- },
-
- abs = sql_prefix("abs", 1),
- acos = sql_prefix("acos", 1),
- acosh = sql_prefix("acosh", 1),
- asin = sql_prefix("asin", 1),
- asinh = sql_prefix("asinh", 1),
- atan = sql_prefix("atan", 1),
- atan2 = sql_prefix("atan2", 2),
- atanh = sql_prefix("atanh", 1),
- ceil = sql_prefix("ceil", 1),
- ceiling = sql_prefix("ceil", 1),
- cos = sql_prefix("cos", 1),
- cosh = sql_prefix("cosh", 1),
- cot = sql_prefix("cot", 1),
- coth = sql_prefix("coth", 1),
- exp = sql_prefix("exp", 1),
- floor = sql_prefix("floor", 1),
- log = function(x, base = exp(1)) {
- build_sql(sql("log"), list(x, base))
- },
- log10 = sql_prefix("log10", 1),
- round = sql_prefix("round", 2),
- sign = sql_prefix("sign", 1),
- sin = sql_prefix("sin", 1),
- sinh = sql_prefix("sinh", 1),
- sqrt = sql_prefix("sqrt", 1),
- tan = sql_prefix("tan", 1),
-
- tolower = sql_prefix("lower", 1),
- toupper = sql_prefix("upper", 1),
- nchar = sql_prefix("length", 1),
-
- `if` = sql_if,
- if_else = sql_if,
- ifelse = sql_if,
-
- sql = function(...) sql(...),
- `(` = function(x) {
- build_sql("(", x, ")")
- },
- `{` = function(x) {
- build_sql("(", x, ")")
- },
- desc = function(x) {
- build_sql(x, sql(" DESC"))
- },
-
- is.null = function(x) {
- build_sql("(", x, ") IS NULL")
- },
- is.na = function(x) {
- build_sql("(", x, ") IS NULL")
- },
- na_if = sql_prefix("NULL_IF", 2),
-
- as.numeric = function(x) build_sql("CAST(", x, " AS NUMERIC)"),
- as.integer = function(x) build_sql("CAST(", x, " AS INTEGER)"),
- as.character = function(x) build_sql("CAST(", x, " AS TEXT)"),
-
- c = function(...) escape(c(...)),
- `:` = function(from, to) escape(from:to),
-
- between = function(x, left, right) {
- build_sql(x, " BETWEEN ", left, " AND ", right)
- },
-
- pmin = sql_prefix("min"),
- pmax = sql_prefix("max"),
-
- `__dplyr_colwise_fun` = function(...) {
- stop("colwise verbs only accept bare functions with local sources",
- call. = FALSE)
- }
-)
-
-base_symbols <- sql_translator(
- pi = sql("PI()"),
- `*` = sql("*"),
- `NULL` = sql("NULL")
-)
-
-#' @export
-#' @rdname sql_variant
-#' @format NULL
-base_agg <- sql_translator(
- # SQL-92 aggregates
- # http://db.apache.org/derby/docs/10.7/ref/rrefsqlj33923.html
- n = sql_prefix("count"),
- mean = sql_prefix("avg", 1),
- var = sql_prefix("variance", 1),
- sum = sql_prefix("sum", 1),
- min = sql_prefix("min", 1),
- max = sql_prefix("max", 1),
- n_distinct = function(x) {
- build_sql("COUNT(DISTINCT ", x, ")")
- }
-)
-
-#' @export
-#' @rdname sql_variant
-#' @format NULL
-base_win <- sql_translator(
- # rank functions have a single order argument that overrides the default
- row_number = win_rank("row_number"),
- min_rank = win_rank("rank"),
- rank = win_rank("rank"),
- dense_rank = win_rank("dense_rank"),
- percent_rank = win_rank("percent_rank"),
- cume_dist = win_rank("cume_dist"),
- ntile = function(order_by, n) {
- over(
- build_sql("NTILE", list(as.integer(n))),
- partition_group(),
- order_by %||% partition_order()
- )
- },
-
- # Recycled aggregate functions take a single argument, don't need order and
- # include entire partition in frame.
- mean = win_recycled("avg"),
- sum = win_recycled("sum"),
- min = win_recycled("min"),
- max = win_recycled("max"),
- n = function() {
- over(sql("COUNT(*)"), partition_group())
- },
-
- # Cumulative functions are like recycled aggregates except that R names
- # have cum prefix, order_by is inherited and frame goes from -Inf to 0.
- cummean = win_cumulative("mean"),
- cumsum = win_cumulative("sum"),
- cummin = win_cumulative("min"),
- cummax = win_cumulative("max"),
-
- # Finally there are a few miscellaneous functions that don't follow any
- # particular pattern
- nth = function(x, order = NULL) {
- over(build_sql("NTH_VALUE", list(x)), partition_group(), order %||% partition$order())
- },
- first = function(x, order = NULL) {
- over(build_sql("FIRST_VALUE", list(x)), partition_group(), order %||% partition_order())
- },
- last = function(x, order = NULL) {
- over(build_sql("LAST_VALUE", list(x)), partition_group(), order %||% partition_order())
- },
-
- lead = function(x, n = 1L, default = NA, order = NULL) {
- over(
- build_sql("LEAD", list(x, n, default)),
- partition_group(),
- order %||% partition_order()
- )
- },
- lag = function(x, n = 1L, default = NA, order = NULL) {
- over(
- build_sql("LAG", list(x, n, default)),
- partition_group(),
- order %||% partition_order()
- )
- },
-
- order_by = function(order_by, expr) {
- old <- set_partition(partition_group(), order_by)
- on.exit(set_partition(old))
-
- expr
- }
-)
-
-#' @export
-#' @rdname sql_variant
-#' @format NULL
-base_no_win <- sql_translator(
- row_number = win_absent("row_number"),
- min_rank = win_absent("rank"),
- rank = win_absent("rank"),
- dense_rank = win_absent("dense_rank"),
- percent_rank = win_absent("percent_rank"),
- cume_dist = win_absent("cume_dist"),
- ntile = win_absent("ntile"),
- mean = win_absent("avg"),
- sum = win_absent("sum"),
- min = win_absent("min"),
- max = win_absent("max"),
- n = win_absent("n"),
- cummean = win_absent("mean"),
- cumsum = win_absent("sum"),
- cummin = win_absent("min"),
- cummax = win_absent("max"),
- nth = win_absent("nth_value"),
- first = win_absent("first_value"),
- last = win_absent("last_value"),
- lead = win_absent("lead"),
- lag = win_absent("lag"),
- order_by = win_absent("order_by")
-)
diff --git a/R/translate-sql-helpers.r b/R/translate-sql-helpers.r
deleted file mode 100644
index 045eda5..0000000
--- a/R/translate-sql-helpers.r
+++ /dev/null
@@ -1,220 +0,0 @@
-#' Create an sql translator
-#'
-#' When creating a package that maps to a new SQL based src, you'll often
-#' want to provide some additional mappings from common R commands to the
-#' commands that your tbl provides. These three functions make that
-#' easy.
-#'
-#' @section Helper functions:
-#'
-#' \code{sql_infix} and \code{sql_prefix} create default SQL infix and prefix
-#' functions given the name of the SQL function. They don't perform any input
-#' checking, but do correctly escape their input, and are useful for
-#' quickly providing default wrappers for a new SQL variant.
-#'
-#' @keywords internal
-#' @param scalar,aggregate,window The three families of functions that an
-#' SQL variant can supply.
-#' @param ...,.funs named functions, used to add custom converters from standard
-#' R functions to sql functions. Specify individually in \code{...}, or
-#' provide a list of \code{.funs}
-#' @param .parent the sql variant that this variant should inherit from.
-#' Defaults to \code{base_sql} which provides a standard set of
-#' mappings for the most common operators and functions.
-#' @param f the name of the sql function as a string
-#' @param n for \code{sql_infix}, an optional number of arguments to expect.
-#' Will signal error if not correct.
-#' @seealso \code{\link{sql}} for an example of a more customised sql
-#' conversion function.
-#' @export
-#' @examples
-#' # An example of adding some mappings for the statistical functions that
-#' # postgresql provides: http://bit.ly/K5EdTn
-#'
-#' postgres_agg <- sql_translator(.parent = base_agg,
-#' cor = sql_prefix("corr"),
-#' cov = sql_prefix("covar_samp"),
-#' sd = sql_prefix("stddev_samp"),
-#' var = sql_prefix("var_samp")
-#' )
-#' postgres_var <- sql_variant(
-#' base_scalar,
-#' postgres_agg
-#' )
-#'
-#' translate_sql(cor(x, y), variant = postgres_var)
-#' translate_sql(sd(income / years), variant = postgres_var)
-#'
-#' # Any functions not explicitly listed in the converter will be translated
-#' # to sql as is, so you don't need to convert all functions.
-#' translate_sql(regr_intercept(y, x), variant = postgres_var)
-sql_variant <- function(scalar = sql_translator(),
- aggregate = sql_translator(),
- window = sql_translator()) {
- stopifnot(is.environment(scalar))
- stopifnot(is.environment(aggregate))
- stopifnot(is.environment(window))
-
- structure(list(scalar = scalar, aggregate = aggregate, window = window),
- class = "sql_variant")
-}
-
-is.sql_variant <- function(x) inherits(x, "sql_variant")
-
-#' @export
-print.sql_variant <- function(x, ...) {
- wrap_ls <- function(x, ...) {
- vars <- sort(ls(envir = x))
- wrapped <- strwrap(paste0(vars, collapse = ", "), ...)
- if (identical(wrapped, "")) return()
- paste0(wrapped, "\n", collapse = "")
- }
-
- cat("<sql_variant>\n")
- cat(wrap_ls(x$scalar, prefix = "scalar: "))
- cat(wrap_ls(x$aggregate, prefix = "aggregate: "))
- cat(wrap_ls(x$window, prefix = "window: "))
-}
-
-#' @export
-names.sql_variant <- function(x) {
- c(ls(envir = x$scalar), ls(envir = x$aggregate), ls(envir = x$window))
-}
-
-#' @export
-#' @rdname sql_variant
-sql_translator <- function(..., .funs = list(),
- .parent = new.env(parent = emptyenv())) {
- funs <- c(list(...), .funs)
- if (length(funs) == 0) return(.parent)
-
- list2env(funs, copy_env(.parent))
-}
-
-copy_env <- function(from, to = NULL, parent = parent.env(from)) {
- list2env(as.list(from), envir = to, parent = parent)
-}
-
-#' @rdname sql_variant
-#' @export
-sql_infix <- function(f) {
- assert_that(is.string(f))
-
- f <- toupper(f)
- function(x, y) {
- build_sql(x, " ", sql(f), " ", y)
- }
-}
-
-#' @rdname sql_variant
-#' @export
-sql_prefix <- function(f, n = NULL) {
- assert_that(is.string(f))
-
- f <- toupper(f)
- function(..., na.rm) {
- if (!missing(na.rm)) {
- message("na.rm not needed in SQL: NULL are always dropped", call. = FALSE)
- }
-
- args <- list(...)
- if (!is.null(n) && length(args) != n) {
- stop("Invalid number of args to SQL ", f, ". Expecting ", n,
- call. = FALSE)
- }
- if (any(names2(args) != "")) {
- warning("Named arguments ignored for SQL ", f, call. = FALSE)
- }
- build_sql(sql(f), args)
- }
-}
-
-#' @rdname sql_variant
-#' @export
-sql_not_supported <- function(f) {
- assert_that(is.string(f))
-
- f <- toupper(f)
- function(...) {
- stop(f, " is not available in this SQL variant", call. = FALSE)
- }
-}
-
-win_rank <- function(f) {
- force(f)
- function(order = NULL) {
- over(build_sql(sql(f), list()), partition_group(), order %||% partition_order())
- }
-}
-win_recycled <- function(f) {
- force(f)
- function(x) {
- over(build_sql(sql(f), list(x)), partition_group())
- }
-}
-win_cumulative <- function(f) {
- force(f)
- function(x) {
- over(build_sql(sql(f), list(x)), partition_group(), partition_order(), frame = c(-Inf, 0))
- }
-}
-
-win_absent <- function(f) {
- force(f)
-
- function(...) {
- stop(
- "Window function `", f, "()` is not supported by this database",
- call. = FALSE
- )
- }
-}
-
-
-# Use a global variable to communicate state of partitioning between
-# tbl and sql translator. This isn't the most amazing design, but it keeps
-# things loosely coupled and is straightforward to understand.
-partition <- new.env(parent = emptyenv())
-partition$group_by <- NULL
-partition$order_by <- NULL
-partition$con <- NULL
-
-set_partition_con <- function(con) {
- old <- partition$con
- partition$con <- con
- invisible(old)
-}
-
-set_partition_group <- function(vars) {
- stopifnot(is.null(vars) || is.character(vars))
-
- old <- partition$group_by
- partition$group_by <- vars
- invisible(old)
-}
-
-set_partition_order <- function(vars) {
- stopifnot(is.null(vars) || is.character(vars))
-
- old <- partition$order_by
- partition$order_by <- vars
- invisible(old)
-}
-
-set_partition <- function(group_by, order_by, con = NULL) {
- old <- list(partition$group_by, partition$order_by)
- if (is.list(group_by)) {
- order_by <- group_by[[2]]
- group_by <- group_by[[1]]
- }
-
- partition$group_by <- group_by
- partition$order_by <- order_by
- partition$con <- con
-
- invisible(old)
-}
-
-partition_group <- function() partition$group_by
-partition_order <- function() partition$order_by
-partition_con <- function() partition$con
diff --git a/R/translate-sql-window.r b/R/translate-sql-window.r
deleted file mode 100644
index 15048a2..0000000
--- a/R/translate-sql-window.r
+++ /dev/null
@@ -1,66 +0,0 @@
-uses_window_fun <- function(x, con) {
- if (is.null(x)) return(FALSE)
- if (inherits(x, "lazy_dots")) {
- calls <- unlist(lapply(x, function(x) all_calls(x$expr)))
- } else {
- calls <- all_calls(x)
- }
-
- win_f <- ls(envir = sql_translate_env(con)$window)
- any(calls %in% win_f)
-}
-
-common_window_funs <- ls(sql_translate_env(NULL)$window)
-
-#' @noRd
-#' @examples
-#' translate_window_where(quote(1))
-#' translate_window_where(quote(x))
-#' translate_window_where(quote(x == 1))
-#' translate_window_where(quote(x == 1 && y == 2))
-#' translate_window_where(quote(n() > 10))
-#' translate_window_where(quote(rank() > cumsum(AB)))
-translate_window_where <- function(expr, window_funs = common_window_funs) {
- if (is.atomic(expr) || is.name(expr)) {
- window_where(expr, list())
- } else if (is.call(expr)) {
- if (as.character(expr[[1]]) %in% window_funs) {
- name <- unique_name()
- window_where(as.name(name), setNames(list(expr), name))
- } else {
- args <- lapply(expr[-1], translate_window_where, window_funs = window_funs)
- expr <- as.call(c(expr[[1]], lapply(args, "[[", "expr")))
-
- window_where(
- expr = expr,
- comp = unlist(lapply(args, "[[", "comp"), recursive = FALSE)
- )
- }
- } else {
- stop("Unknown type: ", typeof(expr))
- }
-}
-
-
-#' @noRd
-#' @examples
-#' translate_window_where_all(list(quote(x == 1), quote(n() > 2)))
-#' translate_window_where_all(list(quote(cumsum(x) == 10), quote(n() > 2)))
-translate_window_where_all <- function(x, window_funs = common_window_funs) {
- out <- lapply(x, translate_window_where, window_funs = window_funs)
-
- list(
- expr = unlist(lapply(out, "[[", "expr"), recursive = FALSE),
- comp = unlist(lapply(out, "[[", "comp"), recursive = FALSE)
- )
-}
-
-window_where <- function(expr, comp) {
- stopifnot(is.call(expr) || is.name(expr) || is.atomic(expr))
- stopifnot(is.list(comp))
-
- list(
- expr = expr,
- comp = comp
- )
-}
diff --git a/R/translate-sql.r b/R/translate-sql.r
deleted file mode 100644
index 386fac6..0000000
--- a/R/translate-sql.r
+++ /dev/null
@@ -1,233 +0,0 @@
-#' Translate an expression to sql.
-#'
-#' @section Base translation:
-#' The base translator, \code{base_sql},
-#' provides custom mappings for \code{!} (to NOT), \code{&&} and \code{&} to
-#' \code{AND}, \code{||} and \code{|} to \code{OR}, \code{^} to \code{POWER},
-#' \code{\%>\%} to \code{\%}, \code{ceiling} to \code{CEIL}, \code{mean} to
-#' \code{AVG}, \code{var} to \code{VARIANCE}, \code{tolower} to \code{LOWER},
-#' \code{toupper} to \code{UPPER} and \code{nchar} to \code{length}.
-#'
-#' \code{c} and \code{:} keep their usual R behaviour so you can easily create
-#' vectors that are passed to sql.
-#'
-#' All other functions will be preserved as is. R's infix functions
-#' (e.g. \code{\%like\%}) will be converted to their sql equivalents
-#' (e.g. \code{LIKE}). You can use this to access SQL string concatenation:
-#' \code{||} is mapped to \code{OR}, but \code{\%||\%} is mapped to \code{||}.
-#' To suppress this behaviour, and force errors immediately when dplyr doesn't
-#' know how to translate a function it encounters, using set the
-#' \code{dplyr.strict_sql} option to \code{TRUE}.
-#'
-#' You can also use \code{sql} to insert a raw sql string.
-#'
-#' @section SQLite translation:
-#' The SQLite variant currently only adds one additional function: a mapping
-#' from \code{sd} to the SQL aggregation function \code{stdev}.
-#'
-#' @param ...,dots Expressions to translate. \code{sql_translate}
-#' automatically quotes them for you. \code{sql_translate_} expects
-#' a list of already quoted objects.
-#' @param con An optional database connection to control the details of
-#' the translation. The default, \code{NULL}, generates ANSI SQL.
-#' @param vars A character vector giving variable names in the remote
-#' data source. If this is supplied, \code{translate_sql} will call
-#' \code{\link{partial_eval}} to interpolate in the values from local
-#' variables.
-#' @param vars_group,vars_order Grouping and ordering variables used for
-#' windowed functions.
-#' @param window Use \code{FALSE} to suppress generation of the \code{OVER}
-#' statement used for window functions. This is necessary when generating
-#' SQL for a grouped summary.
-#' @export
-#' @examples
-#' # Regular maths is translated in a very straightforward way
-#' translate_sql(x + 1)
-#' translate_sql(sin(x) + tan(y))
-#'
-#' # Note that all variable names are escaped
-#' translate_sql(like == "x")
-#' # In ANSI SQL: "" quotes variable _names_, '' quotes strings
-#'
-#' # Logical operators are converted to their sql equivalents
-#' translate_sql(x < 5 & !(y >= 5))
-#' # xor() doesn't have a direct SQL equivalent
-#' translate_sql(xor(x, y))
-#'
-#' # If is translated into case when
-#' translate_sql(if (x > 5) "big" else "small")
-#'
-#' # Infix functions are passed onto SQL with % removed
-#' translate_sql(first %like% "Had*")
-#' translate_sql(first %is% NULL)
-#' translate_sql(first %in% c("John", "Roger", "Robert"))
-#'
-#'
-#' # And be careful if you really want integers
-#' translate_sql(x == 1)
-#' translate_sql(x == 1L)
-#'
-#' # If you have an already quoted object, use translate_sql_:
-#' x <- quote(y + 1 / sin(t))
-#' translate_sql_(list(x))
-#'
-#' # Translation with known variables ------------------------------------------
-#'
-#' # If the variables in the dataset are known, translate_sql will interpolate
-#' # in literal values from the current environment
-#' x <- 10
-#' translate_sql(mpg > x)
-#' translate_sql(mpg > x, vars = names(mtcars))
-#'
-#' # By default all computations happens in sql
-#' translate_sql(cyl == 2 + 2, vars = names(mtcars))
-#' # Use local to force local evaluation
-#' translate_sql(cyl == local(2 + 2), vars = names(mtcars))
-#'
-#' # This is also needed if you call a local function:
-#' inc <- function(x) x + 1
-#' translate_sql(mpg > inc(x), vars = names(mtcars))
-#' translate_sql(mpg > local(inc(x)), vars = names(mtcars))
-#'
-#' # Windowed translation --------------------------------------------
-#' # Known window functions automatically get OVER()
-#' translate_sql(mpg > mean(mpg))
-#'
-#' # Suppress this with window = FALSE
-#' translate_sql(mpg > mean(mpg), window = FALSE)
-#'
-#' # vars_group controls partition:
-#' translate_sql(mpg > mean(mpg), vars_group = "cyl")
-#'
-#' # and vars_order controls ordering for those functions that need it
-#' translate_sql(cumsum(mpg))
-#' translate_sql(cumsum(mpg), vars_order = "mpg")
-translate_sql <- function(...,
- con = NULL,
- vars = character(),
- vars_group = NULL,
- vars_order = NULL,
- window = TRUE) {
- dots <- lazyeval::lazy_dots(...)
-
- translate_sql_(dots,
- con = con,
- vars = vars,
- vars_group = vars_group,
- vars_order = vars_order,
- window = window
- )
-}
-
-#' @export
-#' @rdname translate_sql
-translate_sql_ <- function(dots,
- con = NULL,
- vars = character(),
- vars_group = NULL,
- vars_order = NULL,
- window = TRUE) {
- expr <- lazyeval::as.lazy_dots(dots, env = parent.frame())
- if (!any(has_names(expr))) {
- names(expr) <- NULL
- }
-
- if (length(vars) > 0) {
- # If variables are known, partially evaluate input
- expr <- partial_eval2(expr, vars)
- } else {
- # Otherwise just extract expressions, ignoring the environment
- # from which they came
- expr <- lapply(expr, "[[", "expr")
- }
- variant <- sql_translate_env(con)
-
- if (window) {
- old_con <- set_partition_con(con)
- on.exit(set_partition_con(old_con), add = TRUE)
-
- old_group <- set_partition_group(vars_group)
- on.exit(set_partition_group(old_group), add = TRUE)
-
- old_order <- set_partition_order(vars_order)
- on.exit(set_partition_order(old_order), add = TRUE)
- }
-
- pieces <- lapply(expr, function(x) {
- if (is.atomic(x)) return(escape(x, con = con))
-
- env <- sql_env(x, variant, con, window = window)
- escape(eval(x, envir = env))
- })
-
- sql(unlist(pieces))
-}
-
-sql_env <- function(expr, variant, con, window = FALSE,
- strict = getOption("dplyr.strict_sql")) {
- stopifnot(is.sql_variant(variant))
-
- # Default for unknown functions
- if (!strict) {
- unknown <- setdiff(all_calls(expr), names(variant))
- default_env <- ceply(unknown, default_op, parent = emptyenv())
- } else {
- default_env <- new.env(parent = emptyenv())
- }
-
-
- # Known R -> SQL functions
- special_calls <- copy_env(variant$scalar, parent = default_env)
- if (!window) {
- special_calls2 <- copy_env(variant$aggregate, parent = special_calls)
- } else {
- special_calls2 <- copy_env(variant$window, parent = special_calls)
- }
-
- # Existing symbols in expression
- names <- all_names(expr)
- name_env <- ceply(names, function(x) escape(ident(x), con = con),
- parent = special_calls2)
-
- # Known sql expressions
- symbol_env <- copy_env(base_symbols, parent = name_env)
- symbol_env
-}
-
-default_op <- function(x) {
- assert_that(is.string(x))
- infix <- c("::", "$", "@", "^", "*", "/", "+", "-", ">", ">=", "<", "<=",
- "==", "!=", "!", "&", "&&", "|", "||", "~", "<-", "<<-")
-
- if (x %in% infix) {
- sql_infix(x)
- } else if (grepl("^%.*%$", x)) {
- x <- substr(x, 2, nchar(x) - 1)
- sql_infix(x)
- } else {
- sql_prefix(x)
- }
-}
-
-
-all_calls <- function(x) {
- if (!is.call(x)) return(NULL)
-
- fname <- as.character(x[[1]])
- unique(c(fname, unlist(lapply(x[-1], all_calls), use.names = FALSE)))
-}
-
-all_names <- function(x) {
- if (is.name(x)) return(as.character(x))
- if (!is.call(x)) return(NULL)
-
- unique(unlist(lapply(x[-1], all_names), use.names = FALSE))
-}
-
-# character vector -> environment
-ceply <- function(x, f, ..., parent = parent.frame()) {
- if (length(x) == 0) return(new.env(parent = parent))
- l <- lapply(x, f, ...)
- names(l) <- x
- list2env(l, parent = parent)
-}
diff --git a/R/ts.R b/R/ts.R
new file mode 100644
index 0000000..3092cdb
--- /dev/null
+++ b/R/ts.R
@@ -0,0 +1,4 @@
+#' @export
+filter.ts <- function(.data, ...) {
+ bad_args(".data", "must be a data source, not a ts object, do you want `stats::filter()`?")
+}
diff --git a/R/utils-expr.R b/R/utils-expr.R
new file mode 100644
index 0000000..487e463
--- /dev/null
+++ b/R/utils-expr.R
@@ -0,0 +1,50 @@
+
+expr_type_of <- function(x) {
+ type <- typeof(x)
+ if (type %in% c("symbol", "language", "pairlist", "NULL")) {
+ type
+ } else {
+ "literal"
+ }
+}
+switch_expr <- function(.x, ...) {
+ switch(expr_type_of(.x), ...)
+}
+
+node_walk_replace <- function(node, old, new) {
+ while(!is_null(node)) {
+ switch_expr(node_car(node),
+ language = node_walk_replace(node_cdar(node), old, new),
+ symbol = if (identical(node_car(node), old)) mut_node_car(node, new)
+ )
+ node <- node_cdr(node)
+ }
+}
+expr_substitute <- function(expr, old, new) {
+ expr <- duplicate(expr)
+ switch_type(expr,
+ formula = ,
+ language = node_walk_replace(node_cdr(expr), old, new),
+ symbol = if (identical(expr, old)) return(new)
+ )
+ expr
+}
+
+sym_dollar <- quote(`$`)
+sym_brackets2 <- quote(`[[`)
+is_data_pronoun <- function(expr) {
+ is_lang(expr, list(sym_dollar, sym_brackets2)) &&
+ identical(node_cadr(expr), quote(.data))
+}
+tidy_text <- function(quo, width = 60L) {
+ expr <- f_rhs(quo)
+ if (is_data_pronoun(expr)) {
+ as_string(node_cadr(node_cdr(expr)))
+ } else {
+ quo_text(quo, width = width)
+ }
+}
+named_quos <- function(...) {
+ quos <- quos(...)
+ exprs_auto_name(quos, printer = tidy_text)
+}
diff --git a/R/utils-format.r b/R/utils-format.r
index d74ac41..f6e2555 100644
--- a/R/utils-format.r
+++ b/R/utils-format.r
@@ -1,7 +1,7 @@
#' Describing dimensions
#'
#' Prints the dimensions of an array-like object in a user-friendly manner,
-#' substituting \code{NA} with ?? (for SQL queries).
+#' substituting `NA` with ?? (for SQL queries).
#'
#' @param x Object to show dimensions for.
#' @export
@@ -13,13 +13,17 @@ dim_desc <- function(x) {
d2 <- big_mark(d)
d2[is.na(d)] <- "??"
- paste0("[", paste0(d2, collapse = " x "), "]")
+ fmt_dims(d2)
}
wrap <- function(..., indent = 0) {
x <- paste0(..., collapse = "")
- wrapped <- strwrap(x, indent = indent, exdent = indent + 2,
- width = getOption("width"))
+ wrapped <- strwrap(
+ x,
+ indent = indent,
+ exdent = indent + 2,
+ width = getOption("width")
+ )
paste0(wrapped, collapse = "\n")
}
diff --git a/R/utils-replace-with.R b/R/utils-replace-with.R
index 25b01e2..149e855 100644
--- a/R/utils-replace-with.R
+++ b/R/utils-replace-with.R
@@ -1,13 +1,14 @@
-
-replace_with <- function(x, i, val, name) {
+replace_with <- function(x, i, val, name, reason = NULL) {
if (is.null(val)) {
return(x)
}
- check_length(val, x, name)
+ check_length(val, x, name, reason)
check_type(val, x, name)
check_class(val, x, name)
+ i[is.na(i)] <- FALSE
+
if (length(val) == 1L) {
x[i] <- val
} else {
@@ -17,31 +18,38 @@ replace_with <- function(x, i, val, name) {
x
}
-check_length <- function(x, template, name = deparse(substitute(x))) {
- n <- length(template)
- if (length(x) == n) {
- return()
- }
+check_length <- function(x, template, header, reason = NULL) {
+ check_length_val(length(x), length(template), header, reason)
+}
- if (length(x) == 1L) {
+check_length_col <- function(length_x, n, name, reason = NULL, .abort = abort) {
+ check_length_val(length_x, n, fmt_cols(name), reason, .abort = .abort)
+}
+
+check_length_val <- function(length_x, n, header, reason = NULL, .abort = abort) {
+ if (all(length_x %in% c(1L, n))) {
return()
}
- stop(name, " is length ", length(x), " not 1 or ", n, ".", call. = FALSE)
+ if (is.null(reason)) reason <- ""
+ else reason <- glue(" ({reason})")
+
+ if (n == 1) {
+ glubort(header, "must be length 1{reason}, not {commas(length_x)}", .abort = .abort)
+ } else {
+ glubort(header, "must be length {n}{reason} or one, not {commas(length_x)}", .abort = .abort)
+ }
}
-check_type <- function(x, template, name = deparse(substitute(x))) {
+check_type <- function(x, template, header) {
if (identical(typeof(x), typeof(template))) {
return()
}
- stop(
- name, " has type '", typeof(x), "' not '", typeof(template), "'",
- call. = FALSE
- )
+ glubort(header, "must be type {type_of(template)}, not {typeof(x)}")
}
-check_class <- function(x, template, name = deparse(substitute(x))) {
+check_class <- function(x, template, header) {
if (!is.object(x)) {
return()
}
@@ -50,6 +58,5 @@ check_class <- function(x, template, name = deparse(substitute(x))) {
return()
}
- stop(name, " has class ", paste(class(x), collapse = "/"), " not ",
- paste(class(template), collapse = "/"), call. = FALSE)
+ glubort(header, "must be {fmt_classes(template)}, not {fmt_classes(x)}")
}
diff --git a/R/utils.r b/R/utils.r
index 0f1290b..b6ab44d 100644
--- a/R/utils.r
+++ b/R/utils.r
@@ -3,23 +3,7 @@
magrittr::`%>%`
dots <- function(...) {
- eval(substitute(alist(...)))
-}
-
-named_dots <- function(...) {
- auto_name(dots(...))
-}
-
-auto_names <- function(x) {
- nms <- names2(x)
- missing <- nms == ""
- if (all(!missing)) return(nms)
-
- deparse2 <- function(x) paste(deparse(x, 500L), collapse = "")
- defaults <- vapply(x[missing], deparse2, character(1), USE.NAMES = FALSE)
-
- nms[missing] <- defaults
- nms
+ eval_bare(substitute(alist(...)))
}
deparse_trunc <- function(x, width = getOption("width")) {
@@ -29,41 +13,13 @@ deparse_trunc <- function(x, width = getOption("width")) {
paste0(substr(text[1], 1, width - 3), "...")
}
-auto_name <- function(x) {
- names(x) <- auto_names(x)
- x
-}
-
-is.lang <- function(x) {
- is.name(x) || is.atomic(x) || is.call(x)
-}
-is.lang.list <- function(x) {
- if (is.null(x)) return(TRUE)
-
- is.list(x) && all_apply(x, is.lang)
-}
-on_failure(is.lang.list) <- function(call, env) {
- paste0(call$x, " is not a list containing only names, calls and atomic vectors")
-}
-
-only_has_names <- function(x, nms) {
- all(names(x) %in% nms)
-}
-on_failure(all_names) <- function(call, env) {
- x_nms <- names(eval(call$x, env))
- nms <- eval(call$nms, env)
- extra <- setdiff(x_nms, nms)
-
- paste0(call$x, " has named components: ", paste0(extra, collapse = ", "), ".",
- "Should only have names: ", paste0(nms, collapse = ","))
-}
-
all_apply <- function(xs, f) {
for (x in xs) {
if (!f(x)) return(FALSE)
}
TRUE
}
+
any_apply <- function(xs, f) {
for (x in xs) {
if (f(x)) return(TRUE)
@@ -76,50 +32,18 @@ drop_last <- function(x) {
x[-length(x)]
}
-compact <- function(x) Filter(Negate(is.null), x)
-
-names2 <- function(x) {
- names(x) %||% rep("", length(x))
-}
-
-has_names <- function(x) {
- nms <- names(x)
- if (is.null(nms)) {
- rep(FALSE, length(x))
- } else {
- !is.na(nms) & nms != ""
- }
-}
-
-"%||%" <- function(x, y) if(is.null(x)) y else x
-
-is.wholenumber <- function(x, tol = .Machine$double.eps ^ 0.5) {
- abs(x - round(x)) < tol
-}
-
-as_df <- function(x) {
- class(x) <- "data.frame"
- attr(x, "row.names") <- c(NA_integer_, -length(x[[1]]))
-
- x
+is.wholenumber <- function(x) {
+ trunc(x) == x
}
deparse_all <- function(x) {
- deparse2 <- function(x) paste(deparse(x, width.cutoff = 500L), collapse = "")
- vapply(x, deparse2, FUN.VALUE = character(1))
+ x <- map_if(x, is_formula, f_rhs)
+ map_chr(x, expr_text, width = 500L)
}
-#' Provides comma-separated string out ot the parameters
-#' @export
-#' @keywords internal
-#' @param ... Arguments to be constructed into the string
-named_commas <- function(...) {
- x <- c(...)
- if (is.null(names(x))) {
- paste0(x, collapse = ", ")
- } else {
- paste0(names(x), " = ", x, collapse = ", ")
- }
+deparse_names <- function(x) {
+ x <- map_if(x, is_formula, f_rhs)
+ map_chr(x, deparse)
}
commas <- function(...) paste0(..., collapse = ", ")
@@ -144,31 +68,52 @@ unique_name <- local({
}
})
-isFALSE <- function(x) identical(x, FALSE)
-
-f_lhs <- function(x) if (length(x) >= 3) x[[2]] else NULL
-f_rhs <- function(x) x[[length(x)]]
+succeeds <- function(x, quiet = FALSE) {
+ tryCatch(
+ {
+ x
+ TRUE
+ },
+ error = function(e) {
+ if (!quiet)
+ inform(paste0("Error: ", e$message))
+ FALSE
+ }
+ )
+}
+is_1d <- function(x) {
+ # dimension check is for matrices and data.frames
+ (is_atomic(x) || is.list(x)) && length(dim(x)) <= 1
+}
-substitute_q <- function(x, env) {
- call <- substitute(substitute(x, env), list(x = x))
- eval(call)
+is_negated <- function(x) {
+ is_lang(x, "-", n = 1)
}
+inc_seq <- function(from, to) {
+ if (from > to) {
+ integer()
+ } else {
+ seq.int(from, to)
+ }
+}
-succeeds <- function(x, quiet = FALSE) {
- tryCatch({x; TRUE}, error = function(e) {
- if (!quiet) message("Error: ", e$message)
- FALSE
- })
+random_table_name <- function(n = 10) {
+ paste0(sample(letters, n, replace = TRUE), collapse = "")
}
-# is.atomic() is TRUE for atomic vectors AND NULL!
-is_atomic <- function(x) is.atomic(x) && !is.null(x)
+attr_equal <- function(x, y) {
+ attr_x <- attributes(x)
+ if (!is.null(attr_x)) {
+ attr_x <- attr_x[sort(names(attr_x))]
+ }
-is_1d <- function(x) {
- # dimension check is for matrices and data.frames
- (is_atomic(x) || is.list(x)) && length(dim(x)) <= 1
+ attr_y <- attributes(y)
+ if (!is.null(attr_y)) {
+ attr_y <- attr_y[sort(names(attr_y))]
+ }
+
+ isTRUE(all.equal(attr_x, attr_y))
}
-is_bare_list <- function(x) is.list(x) && !is.object(x)
diff --git a/R/view.r b/R/view.r
deleted file mode 100644
index 241fb5d..0000000
--- a/R/view.r
+++ /dev/null
@@ -1,47 +0,0 @@
-# data(baseball, package = "plyr")
-# players <- group_by(baseball, id)
-# v <- view(players$obj, players$index)
-# v$set_group(1)
-# v$eval(quote(id))
-view <- function(data, index, parent = parent.frame()) {
- # Current group and rows
- i <- 1
- rows <- index[[1]]
-
- set_group <- function(value) {
- if (i == value) return(rows)
- i <<- value
- rows <<- index[[value]]
- rows
- }
-
- # Tools to manage extra functions
- fun_env <- new.env(parent = parent)
- add_function <- function(name, fun) {
- fun_env[[name]] <- fun
- }
-
- # Tools to manage active bindings
- grp_env <- new.env(parent = fun_env, size = nrow(data))
- add_binding <- function(name, fun) {
- makeActiveBinding(name, fun, grp_env)
- }
- from_data <- function(col) {
- force(col)
- function(v) {
- if (!missing(v)) stop("Immutable view")
- .subset2(data, col)[rows]
- }
- }
-
- for (name in names(data)) {
- add_binding(name, from_data(name))
- }
-
- local_eval <- function(expr) {
- eval(expr, grp_env)
- }
-
- list(set_group = set_group, eval = local_eval, add_function = add_function,
- add_binding = add_binding)
-}
diff --git a/R/zzz.r b/R/zzz.r
index c103d33..23d28a3 100644
--- a/R/zzz.r
+++ b/R/zzz.r
@@ -1,27 +1,15 @@
.onLoad <- function(libname, pkgname) {
op <- options()
op.dplyr <- list(
- dplyr.strict_sql = FALSE,
dplyr.show_progress = TRUE
)
toset <- !(names(op.dplyr) %in% names(op))
- if(any(toset)) options(op.dplyr[toset])
+ if (any(toset)) options(op.dplyr[toset])
invisible()
}
.onAttach <- function(libname, pkgname) {
- when_attached("data.table", {
- if (!is_attached("dtplyr")) {
- packageStartupMessage(rule())
- packageStartupMessage(
- "data.table + dplyr code now lives in dtplyr.\n",
- "Please library(dtplyr)!"
- )
- packageStartupMessage(rule())
- }
- })
-
setHook(packageEvent("plyr", "attach"), function(...) {
packageStartupMessage(rule())
packageStartupMessage("You have loaded plyr after dplyr - this is likely ",
diff --git a/README.md b/README.md
index 64c5d56..bbf600c 100755
--- a/README.md
+++ b/README.md
@@ -1,209 +1,113 @@
-<!-- README.md is generated from README.Rmd. Please edit that file -->
-dplyr
-=====
-
-[![Build Status](https://travis-ci.org/hadley/dplyr.png?branch=master)](https://travis-ci.org/hadley/dplyr) [![CRAN\_Status\_Badge](http://www.r-pkg.org/badges/version/dplyr)](http://cran.r-project.org/package=dplyr) [![Coverage Status](https://img.shields.io/codecov/c/github/hadley/dplyr/master.svg)](https://codecov.io/github/hadley/dplyr?branch=master)
-
-dplyr is the next iteration of plyr, focussed on tools for working with data frames (hence the `d` in the name). It has three main goals:
-
-- Identify the most important data manipulation tools needed for data analysis and make them easy to use from R.
-
-- Provide blazing fast performance for in-memory data by writing key pieces in [C++](http://www.rcpp.org/).
-
-- Use the same interface to work with data no matter where it's stored, whether in a data frame, a data table or database.
-
-You can install:
-
-- the latest released version from CRAN with
-
- ``` r
- install.packages("dplyr")
- ```
-
-- the latest development version from github with
-
- ``` r
- if (packageVersion("devtools") < 1.6) {
- install.packages("devtools")
- }
- devtools::install_github("hadley/lazyeval")
- devtools::install_github("hadley/dplyr")
- ```
-You'll probably also want to install the data packages used in most examples: `install.packages(c("nycflights13", "Lahman"))`.
-
-If you encounter a clear bug, please file a minimal reproducible example on [github](https://github.com/hadley/dplyr/issues). For questions and other discussion, please use the [manipulatr mailing list](https://groups.google.com/group/manipulatr).
-
-Learning dplyr
---------------
-
-To get started, read the notes below, then read the intro vignette: `vignette("introduction", package = "dplyr")`. To make the most of dplyr, I also recommend that you familiarise yourself with the principles of [tidy data](http://vita.had.co.nz/papers/tidy-data.html): this will help you get your data into a form that works well with dplyr, ggplot2 and R's many modelling functions.
-
-If you need more, help I recommend the following (paid) resources:
-
-- [dplyr](https://www.datacamp.com/courses/dplyr) on datacamp, by Garrett Grolemund. Learn the basics of dplyr at your own pace in this interactive online course.
-
-- [Introduction to Data Science with R](http://shop.oreilly.com/product/0636920034834.do): How to Manipulate, Visualize, and Model Data with the R Language, by Garrett Grolemund. This O'Reilly video series will teach you the basics needed to be an effective analyst in R.
-
-Key data structures
--------------------
-
-The key object in dplyr is a *tbl*, a representation of a tabular data structure. Currently `dplyr` supports:
-
-- data frames
-- [data tables](https://github.com/Rdatatable/data.table/wiki)
-- [SQLite](http://sqlite.org/)
-- [PostgreSQL](http://www.postgresql.org/)/[Redshift](http://aws.amazon.com/redshift/)
-- [MySQL](http://www.mysql.com/)/[MariaDB](https://mariadb.com/)
-- [Bigquery](https://developers.google.com/bigquery/)
-- [MonetDB](http://www.monetdb.org/)
-- data cubes with arrays (partial implementation)
+<!-- README.md is generated from README.Rmd. Please edit that file -->
+dplyr <img src="man/figures/logo.png" align="right" />
+======================================================
-You can create them as follows:
+[![Build Status](https://travis-ci.org/tidyverse/dplyr.svg?branch=master)](https://travis-ci.org/tidyverse/dplyr) [![AppVeyor Build Status](https://ci.appveyor.com/api/projects/status/github/tidyverse/dplyr?branch=master&svg=true)](https://ci.appveyor.com/project/tidyverse/dplyr) [![CRAN\_Status\_Badge](http://www.r-pkg.org/badges/version/dplyr)](http://cran.r-project.org/package=dplyr) [![Coverage Status](https://codecov.io/gh/tidyverse/dplyr/branch/master/graph/badge.svg)](https://code [...]
-``` r
-library(dplyr) # for functions
-library(nycflights13) # for data
-flights
-#> Source: local data frame [336,776 x 16]
-#>
-#> year month day dep_time dep_delay arr_time arr_delay carrier tailnum
-#> (int) (int) (int) (int) (dbl) (int) (dbl) (chr) (chr)
-#> 1 2013 1 1 517 2 830 11 UA N14228
-#> 2 2013 1 1 533 4 850 20 UA N24211
-#> 3 2013 1 1 542 2 923 33 AA N619AA
-#> 4 2013 1 1 544 -1 1004 -18 B6 N804JB
-#> 5 2013 1 1 554 -6 812 -25 DL N668DN
-#> 6 2013 1 1 554 -4 740 12 UA N39463
-#> 7 2013 1 1 555 -5 913 19 B6 N516JB
-#> 8 2013 1 1 557 -3 709 -14 EV N829AS
-#> 9 2013 1 1 557 -3 838 -8 B6 N593JB
-#> 10 2013 1 1 558 -2 753 8 AA N3ALAA
-#> .. ... ... ... ... ... ... ... ... ...
-#> Variables not shown: flight (int), origin (chr), dest (chr), air_time
-#> (dbl), distance (dbl), hour (dbl), minute (dbl).
-
-# Caches data in local SQLite db
-flights_db1 <- tbl(nycflights13_sqlite(), "flights")
-
-# Caches data in local postgres db
-flights_db2 <- tbl(nycflights13_postgres(), "flights")
-```
+Overview
+--------
-Each tbl also comes in a grouped variant which allows you to easily perform operations "by group":
+dplyr is a grammar of data manipulation, providing a consistent set of verbs that help you solve the most common data manipulation challenges:
-``` r
-carriers_df <- flights %>% group_by(carrier)
-carriers_db1 <- flights_db1 %>% group_by(carrier)
-carriers_db2 <- flights_db2 %>% group_by(carrier)
-```
+- `mutate()` adds new variables that are functions of existing variables
+- `select()` picks variables based on their names.
+- `filter()` picks cases based on their values.
+- `summarise()` reduces multiple values down to a single summary.
+- `arrange()` changes the ordering of the rows.
-Single table verbs
-------------------
+These all combine naturally with `group_by()` which allows you to perform any operation "by group". You can learn more about them in `vignette("dplyr")`. As well as these single-table verbs, dplyr also provides a variety of two-table verbs, which you can learn about in `vignette("two-table")`.
-`dplyr` implements the following verbs useful for data manipulation:
+dplyr is designed to abstract over how the data is stored. That means as well as working with local data frames, you can also work with remote database tables, using exactly the same R code. Install the dbplyr package then read `vignette("databases", package = "dbplyr")`.
-- `select()`: focus on a subset of variables
-- `filter()`: focus on a subset of rows
-- `mutate()`: add new columns
-- `summarise()`: reduce each group to a smaller number of summary statistics
-- `arrange()`: re-order the rows
+If you are new to dplyr, the best place to start is the [data import chapter](http://r4ds.had.co.nz/transform.html) in R for data science.
-They all work as similarly as possible across the range of data sources. The main difference is performance:
+Installation
+------------
``` r
-system.time(carriers_df %>% summarise(delay = mean(arr_delay)))
-#> user system elapsed
-#> 0.040 0.001 0.043
-system.time(carriers_db1 %>% summarise(delay = mean(arr_delay)) %>% collect())
-#> user system elapsed
-#> 0.348 0.302 1.280
-system.time(carriers_db2 %>% summarise(delay = mean(arr_delay)) %>% collect())
-#> user system elapsed
-#> 0.015 0.000 0.142
-```
+# The easiest way to get dplyr is to install the whole tidyverse:
+install.packages("tidyverse")
-Data frame methods are much much faster than the plyr equivalent. The database methods are slower, but can work with data that don't fit in memory.
+# Alternatively, install just dplyr:
+install.packages("dplyr")
-``` r
-system.time(plyr::ddply(flights, "carrier", plyr::summarise,
- delay = mean(arr_delay, na.rm = TRUE)))
-#> user system elapsed
-#> 0.104 0.029 0.134
+# Or the development version from GitHub:
+# install.packages("devtools")
+devtools::install_github("tidyverse/dplyr")
```
-### `do()`
-
-As well as the specialised operations described above, `dplyr` also provides the generic `do()` function which applies any R function to each group of the data.
-
-Let's take the batting database from the built-in Lahman database. We'll group it by year, and then fit a model to explore the relationship between their number of at bats and runs:
-
-``` r
-by_year <- lahman_df() %>%
- tbl("Batting") %>%
- group_by(yearID)
-by_year %>%
- do(mod = lm(R ~ AB, data = .))
-#> Source: local data frame [144 x 2]
-#> Groups: <by row>
-#>
-#> yearID mod
-#> (int) (list)
-#> 1 1871 <S3:lm>
-#> 2 1872 <S3:lm>
-#> 3 1873 <S3:lm>
-#> 4 1874 <S3:lm>
-#> 5 1875 <S3:lm>
-#> 6 1876 <S3:lm>
-#> 7 1877 <S3:lm>
-#> 8 1878 <S3:lm>
-#> 9 1879 <S3:lm>
-#> 10 1880 <S3:lm>
-#> .. ... ...
-```
+If you encounter a clear bug, please file a minimal reproducible example on [github](https://github.com/tidyverse/dplyr/issues). For questions and other discussion, please use the [manipulatr mailing list](https://groups.google.com/group/manipulatr).
-Note that if you are fitting lots of linear models, it's a good idea to use `biglm` because it creates model objects that are considerably smaller:
+Usage
+-----
``` r
-by_year %>%
- do(mod = lm(R ~ AB, data = .)) %>%
- object.size() %>%
- print(unit = "MB")
-#> 22.7 Mb
-
-by_year %>%
- do(mod = biglm::biglm(R ~ AB, data = .)) %>%
- object.size() %>%
- print(unit = "MB")
-#> 0.8 Mb
+library(dplyr)
+
+starwars %>%
+ filter(species == "Droid")
+#> # A tibble: 5 x 13
+#> name height mass hair… skin… eye_… birt… gend… home… spec… films vehi…
+#> <chr> <int> <dbl> <chr> <chr> <chr> <dbl> <chr> <chr> <chr> <lis> <lis>
+#> 1 C-3PO 167 75.0 <NA> gold yell… 112 <NA> Tato… Droid <chr… <chr…
+#> 2 R2-D2 96 32.0 <NA> "whi… red 33.0 <NA> Naboo Droid <chr… <chr…
+#> 3 R5-D4 97 32.0 <NA> "whi… red NA <NA> Tato… Droid <chr… <chr…
+#> 4 IG-88 200 140 none metal red 15.0 none <NA> Droid <chr… <chr…
+#> 5 BB8 NA NA none none black NA none <NA> Droid <chr… <chr…
+#> # ... with 1 more variable: starships <list>
+
+starwars %>%
+ select(name, ends_with("color"))
+#> # A tibble: 87 x 4
+#> name hair_color skin_color eye_color
+#> <chr> <chr> <chr> <chr>
+#> 1 "Luke Skywalker" blond fair blue
+#> 2 C-3PO <NA> gold yellow
+#> 3 R2-D2 <NA> "white, blue" red
+#> 4 "Darth Vader" none white yellow
+#> 5 "Leia Organa" brown light brown
+#> # ... with 82 more rows
+
+starwars %>%
+ mutate(name, bmi = mass / ((height / 100) ^ 2)) %>%
+ select(name:mass, bmi)
+#> # A tibble: 87 x 4
+#> name height mass bmi
+#> <chr> <int> <dbl> <dbl>
+#> 1 "Luke Skywalker" 172 77.0 26.0
+#> 2 C-3PO 167 75.0 26.9
+#> 3 R2-D2 96 32.0 34.7
+#> 4 "Darth Vader" 202 136 33.3
+#> 5 "Leia Organa" 150 49.0 21.8
+#> # ... with 82 more rows
+
+starwars %>%
+ arrange(desc(mass))
+#> # A tibble: 87 x 13
+#> name heig… mass hair… skin… eye_… birt… gend… home… spec… films vehi…
+#> <chr> <int> <dbl> <chr> <chr> <chr> <dbl> <chr> <chr> <chr> <lis> <lis>
+#> 1 "Jabb… 175 1358 <NA> "gre… oran… 600 herm… "Nal… Hutt <chr… <chr…
+#> 2 Griev… 216 159 none "bro… "gre… NA male Kalee Kale… <chr… <chr…
+#> 3 IG-88 200 140 none metal red 15.0 none <NA> Droid <chr… <chr…
+#> 4 "Dart… 202 136 none white yell… 41.9 male Tato… Human <chr… <chr…
+#> 5 Tarff… 234 136 brown brown blue NA male Kash… Wook… <chr… <chr…
+#> # ... with 82 more rows, and 1 more variable: starships <list>
+
+starwars %>%
+ group_by(species) %>%
+ summarise(
+ n = n(),
+ mass = mean(mass, na.rm = TRUE)
+ ) %>%
+ filter(n > 1)
+#> # A tibble: 9 x 3
+#> species n mass
+#> <chr> <int> <dbl>
+#> 1 Droid 5 69.8
+#> 2 Gungan 3 74.0
+#> 3 Human 35 82.8
+#> 4 Kaminoan 2 88.0
+#> 5 Mirialan 2 53.1
+#> # ... with 4 more rows
```
-
-Multiple table verbs
---------------------
-
-As well as verbs that work on a single tbl, there are also a set of useful verbs that work with two tbls at a time: joins and set operations.
-
-dplyr implements the four most useful joins from SQL:
-
-- `inner_join(x, y)`: matching x + y
-- `left_join(x, y)`: all x + matching y
-- `semi_join(x, y)`: all x with match in y
-- `anti_join(x, y)`: all x without match in y
-
-And provides methods for:
-
-- `intersect(x, y)`: all rows in both x and y
-- `union(x, y)`: rows in either x or y
-- `setdiff(x, y)`: rows in x, but not y
-
-Plyr compatibility
-------------------
-
-You'll need to be a little careful if you load both plyr and dplyr at the same time. I'd recommend loading plyr first, then dplyr, so that the faster dplyr functions come first in the search path. By and large, any function provided by both dplyr and plyr works in a similar way, although dplyr functions tend to be faster and more general.
-
-Related approaches
-------------------
-
-- [Blaze](http://blaze.pydata.org)
-- [|Stat](http://oldwww.acm.org/perlman/stat/)
-- [Pig](http://dx.doi.org/10.1145/1376616.1376726)
diff --git a/build/vignette.rds b/build/vignette.rds
index 19d0805..d5f4173 100644
Binary files a/build/vignette.rds and b/build/vignette.rds differ
diff --git a/data/band_instruments.rda b/data/band_instruments.rda
new file mode 100644
index 0000000..813150f
Binary files /dev/null and b/data/band_instruments.rda differ
diff --git a/data/band_instruments2.rda b/data/band_instruments2.rda
new file mode 100644
index 0000000..676cf56
Binary files /dev/null and b/data/band_instruments2.rda differ
diff --git a/data/band_members.rda b/data/band_members.rda
new file mode 100644
index 0000000..31eb286
Binary files /dev/null and b/data/band_members.rda differ
diff --git a/data/starwars.rda b/data/starwars.rda
new file mode 100644
index 0000000..25fd475
Binary files /dev/null and b/data/starwars.rda differ
diff --git a/data/storms.rda b/data/storms.rda
new file mode 100644
index 0000000..7fc6fc5
Binary files /dev/null and b/data/storms.rda differ
diff --git a/inst/doc/compatibility.R b/inst/doc/compatibility.R
new file mode 100644
index 0000000..bc3f1d7
--- /dev/null
+++ b/inst/doc/compatibility.R
@@ -0,0 +1,114 @@
+## ----setup, include = FALSE----------------------------------------------
+library(dplyr)
+knitr::opts_chunk$set(collapse = T, comment = "#>")
+
+## ---- results = "hide"---------------------------------------------------
+if (utils::packageVersion("dplyr") > "0.5.0") {
+ # code for new version
+} else {
+ # code for old version
+}
+
+## ---- eval = FALSE-------------------------------------------------------
+# if (utils::packageVersion("dplyr") > "0.5.0") {
+# dbplyr::build_sql(...)
+# } else {
+# dplyr::build_sql(...)
+# }
+
+## ------------------------------------------------------------------------
+#' @rawNamespace
+#' if (utils::packageVersion("dplyr") > "0.5.0") {
+#' importFrom("dbplyr", "build_sql")
+#' } else {
+#' importFrom("dplyr", "build_sql")
+#' }
+
+## ---- eval = FALSE-------------------------------------------------------
+# wrap_dbplyr_obj("build_sql")
+#
+# wrap_dbplyr_obj("base_agg")
+
+## ---- eval = FALSE-------------------------------------------------------
+# quo <- quo(cyl)
+# select(mtcars, !! quo)
+
+## ---- results = "hide"---------------------------------------------------
+sym <- quote(cyl)
+select(mtcars, !! sym)
+
+call <- quote(mean(cyl))
+summarise(mtcars, !! call)
+
+## ------------------------------------------------------------------------
+quo(!! sym)
+quo(!! call)
+
+rlang::as_quosure(sym)
+rlang::as_quosure(call)
+
+## ------------------------------------------------------------------------
+f <- ~cyl
+f
+rlang::as_quosure(f)
+
+## ------------------------------------------------------------------------
+rlang::sym("cyl")
+rlang::syms(letters[1:3])
+
+## ------------------------------------------------------------------------
+syms <- rlang::syms(c("foo", "bar", "baz"))
+quo(my_call(!!! syms))
+
+fun <- rlang::sym("my_call")
+quo(UQ(fun)(!!! syms))
+
+## ------------------------------------------------------------------------
+call <- rlang::lang("my_call", !!! syms)
+call
+
+rlang::as_quosure(call)
+
+# Or equivalently:
+quo(!! rlang::lang("my_call", !!! syms))
+
+## ---- eval=FALSE---------------------------------------------------------
+# lazyeval::interp(~ mean(var), var = rlang::sym("mpg"))
+
+## ---- eval=FALSE---------------------------------------------------------
+# var <- "mpg"
+# quo(mean(!! rlang::sym(var)))
+
+## ---- eval = FALSE-------------------------------------------------------
+# filter_.tbl_df <- function(.data, ..., .dots = list()) {
+# dots <- compat_lazy_dots(.dots, caller_env(), ...)
+# filter(.data, !!! dots)
+# }
+
+## ---- eval = FALSE-------------------------------------------------------
+# filter.default <- function(.data, ...) {
+# filter_(.data, .dots = compat_as_lazy_dots(...))
+# }
+
+## ---- eval = FALSE-------------------------------------------------------
+# filter.sf <- function(.data, ...) {
+# st_as_sf(NextMethod())
+# }
+
+## ---- eval = FALSE-------------------------------------------------------
+# mutate_each(starwars, funs(as.character))
+# mutate_all(starwars, funs(as.character))
+
+## ---- eval = FALSE-------------------------------------------------------
+# mutate_all(starwars, as.character)
+
+## ---- eval = FALSE-------------------------------------------------------
+# mutate_each(starwars, funs(as.character), height, mass)
+# mutate_at(starwars, vars(height, mass), as.character)
+
+## ---- eval = FALSE-------------------------------------------------------
+# summarise_at(mtcars, vars(starts_with("d")), mean)
+
+## ---- eval = FALSE-------------------------------------------------------
+# mutate_at(starwars, c("height", "mass"), as.character)
+
diff --git a/inst/doc/compatibility.Rmd b/inst/doc/compatibility.Rmd
new file mode 100644
index 0000000..76e8662
--- /dev/null
+++ b/inst/doc/compatibility.Rmd
@@ -0,0 +1,288 @@
+---
+title: "dplyr compatibility"
+output: rmarkdown::html_vignette
+vignette: >
+ %\VignetteIndexEntry{dplyr compatibility}
+ %\VignetteEngine{knitr::rmarkdown}
+ %\VignetteEncoding{UTF-8}
+---
+
+```{r setup, include = FALSE}
+library(dplyr)
+knitr::opts_chunk$set(collapse = T, comment = "#>")
+```
+
+This vignette is aimed at package authors who need to update their code because of a backward incompatible change to dplyr. We do try and minimise backward incompatible changes as much as possible, but sometimes they are necessary in order to radically simplify existing code, or unlock a lot of potential value in the future.
+
+This vignette starts with some general advice on writing package code that works with multiple versions of dplyr, then continues to discuss specific changes in dplyr versions.
+
+## Working with multiple dplyr versions
+
+Ideally, you want to make sure that your package works with both the released version and the development version of dplyr. This is typically a little bit more work, but has two big advantages:
+
+1. It's more convenient for your users, since they're not forced to update
+ dplyr if they don't want to.
+
+1. It's easier on CRAN since it doesn't require a massive coordinated release
+ of multiple packages.
+
+To make code work with multiple versions of a package, your first tool is the simple if statement:
+
+```{r, results = "hide"}
+if (utils::packageVersion("dplyr") > "0.5.0") {
+ # code for new version
+} else {
+ # code for old version
+}
+```
+
+Always condition on `> current-version`, not `>= next-version` because this will ensure that this branch is also used for the development version of the package. For example, if the current release is version "0.5.0", the development version will be "0.5.0.9000".
+
+Occasionally, you'll run into a situation where the `NAMESPACE` has changed and you need to conditionally import different functions. This typically occurs when functions are moved from one package to another. We try our best to provide automatic fallbacks, but this is not always possible. Often you can work around the problem by avoiding `importFrom` and using `::` instead. Do this where possible:
+
+```{r, eval = FALSE}
+if (utils::packageVersion("dplyr") > "0.5.0") {
+ dbplyr::build_sql(...)
+} else {
+ dplyr::build_sql(...)
+}
+```
+
+This will generate an `R CMD check` NOTE (because one of the functions will always be missing), but this is ok. Simply explain that you get the note because you have written a wrapper to make sure your code is backward compatible.
+
+Sometimes it's not possible to avoid `importFrom()`. For example you might be importing a generic so that you can define a method for it. In this case, you can take advantage of a little-known feature in the `NAMESPACE` file: you can include `if` statements.
+
+```{r}
+#' @rawNamespace
+#' if (utils::packageVersion("dplyr") > "0.5.0") {
+#' importFrom("dbplyr", "build_sql")
+#' } else {
+#' importFrom("dplyr", "build_sql")
+#' }
+```
+
+## dplyr 0.6.0
+
+### Database code moves to dbplyr
+
+Almost all database related code has been moved out of dplyr and into a new package, [dbplyr](http://github.com/hadley/dbplyr/). This makes dplyr simpler, and will make it easier to release fixes for bugs that only affect databases. If you've implemented a database backend for dplyr, please read the [backend news](https://github.com/hadley/dbplyr/blob/master/NEWS.md#backends) on the backend.
+
+Depending on what generics you use, and what generics you provide methods for, you may need to write some conditional code. To help make this easier we've written `wrap_dbplyr_obj()` which will write the helper code for you:
+
+```{r, eval = FALSE}
+wrap_dbplyr_obj("build_sql")
+
+wrap_dbplyr_obj("base_agg")
+```
+
+Simply copy the results of this function in your package.
+
+These will generate `R CMD check` NOTES, so make sure to tell CRAN that this is to ensure backward compatibility.
+
+
+### Deprecation of underscored `verbs_()`
+
+Because the tidyeval framework allows us to combine SE and NSE
+semantics within the same functions, the underscored verbs have been
+softly deprecated.
+
+
+#### For users of SE_ verbs
+
+The legacy underscored versions take objects for which a
+`lazyeval::as.lazy()` method is defined. This includes symbols and
+calls, strings, and formulas. All of these objects have been replaced
+with quosures and you can call tidyeval verbs with unquoted quosures:
+
+```{r, eval = FALSE}
+quo <- quo(cyl)
+select(mtcars, !! quo)
+```
+
+Symbolic expressions are also supported, but note that bare symbols
+and calls do not carry scope information. If you're referring to
+objects in the data frame, it's safe to omit specifying an enclosure:
+
+```{r, results = "hide"}
+sym <- quote(cyl)
+select(mtcars, !! sym)
+
+call <- quote(mean(cyl))
+summarise(mtcars, !! call)
+```
+
+Transforming objects into quosures is generally straightforward. To
+enclose with the current environment, you can unquote directly in
+`quo()` or you can use `as_quosure()`:
+
+```{r}
+quo(!! sym)
+quo(!! call)
+
+rlang::as_quosure(sym)
+rlang::as_quosure(call)
+```
+
+Note that while formulas and quosures are very similar objects (and in
+the most general sense, formulas are quosures), they can't be used
+interchangeably in tidyeval functions. Early implementations did treat
+bare formulas as quosures, but this created compatibility issues with
+modelling functions of the stats package. Fortunately, it's easy to
+transform formulas to quosures that will self-evaluate in tidyeval
+functions:
+
+```{r}
+f <- ~cyl
+f
+rlang::as_quosure(f)
+```
+
+Finally, and perhaps most importantly, **strings are not and should
+not be parsed**. As developers, it is tempting to try and solve
+problems using strings because we have been trained to work with
+strings rather than quoted expressions. However it's almost always the
+wrong way to approach the problem. The exception is for creating
+symbols. In that case it is perfectly legitimate to use strings:
+
+```{r}
+rlang::sym("cyl")
+rlang::syms(letters[1:3])
+```
+
+But you should never use strings to create calls. Instead you can use
+quasiquotation:
+
+```{r}
+syms <- rlang::syms(c("foo", "bar", "baz"))
+quo(my_call(!!! syms))
+
+fun <- rlang::sym("my_call")
+quo(UQ(fun)(!!! syms))
+```
+
+Or create the call with `lang()`:
+
+```{r}
+call <- rlang::lang("my_call", !!! syms)
+call
+
+rlang::as_quosure(call)
+
+# Or equivalently:
+quo(!! rlang::lang("my_call", !!! syms))
+```
+
+Note that idioms based on `interp()` should now generally be avoided
+and replaced with quasiquotation. Where you used to interpolate:
+
+```{r, eval=FALSE}
+lazyeval::interp(~ mean(var), var = rlang::sym("mpg"))
+```
+
+You would now unquote:
+
+```{r, eval=FALSE}
+var <- "mpg"
+quo(mean(!! rlang::sym(var)))
+```
+
+See also `vignette("programming")` for more about quasiquotation and
+quosures.
+
+
+#### For package authors
+
+For package authors, rlang provides a
+[compatibility file](https://github.com/hadley/rlang/blob/master/R/compat-lazyeval.R) that
+you can copy to your package. `compat_lazy()` and `compat_lazy_dots()`
+turn lazy-able objects into proper quosures. This helps provide an
+underscored version to your users for backward compatibility. For
+instance, here is how we defined the underscored version of `filter()`
+in dplyr 0.6:
+
+```{r, eval = FALSE}
+filter_.tbl_df <- function(.data, ..., .dots = list()) {
+ dots <- compat_lazy_dots(.dots, caller_env(), ...)
+ filter(.data, !!! dots)
+}
+```
+
+With tidyeval, S3 dispatch to the correct method might be an issue. In
+the past, the genericity of dplyr verbs was accomplished by
+dispatching in the underscored versions. Now that those are
+deprecated, we've turned the non-underscored verbs into S3 generics.
+
+We maintain backward compatibility by redispatching to old underscored
+verbs in the default methods of the new S3 generics. For example, here
+is how we redispatch `filter()`:
+
+```{r, eval = FALSE}
+filter.default <- function(.data, ...) {
+ filter_(.data, .dots = compat_as_lazy_dots(...))
+}
+```
+
+This gets the job done in most cases. However, the default method will
+not be called for objects inheriting from one of the classes for which
+we provide non-underscored methods: `data.frame`, `tbl_df`, `tbl_cube`
+and `grouped_df`. An example of this is the `sf` package whose objects
+have classes `c("sf", "data.frame")`. Authors of such packages should
+provide a method for the non-underscored generic in order to be
+compatible with dplyr:
+
+```{r, eval = FALSE}
+filter.sf <- function(.data, ...) {
+ st_as_sf(NextMethod())
+}
+```
+
+If you need help with this, please let us know!
+
+
+### Deprecation of `mutate_each()` and `summarise_each()`
+
+These functions have been replaced by a more complete family of
+functions. This family has suffixes `_if`, `_at` and `_all` and
+includes more verbs than just `mutate` and `summarise`.
+
+If you need to update your code to the new family, there are two
+relevant functions depending on which variables you apply `funs()` to.
+If you called `mutate_each()` without supplying a selection of
+variables, `funs` is applied to all variables. In this case, you
+should update your code to use `mutate_all()` instead:
+
+```{r, eval = FALSE}
+mutate_each(starwars, funs(as.character))
+mutate_all(starwars, funs(as.character))
+```
+
+Note that the new verbs support bare functions as well, so you don't
+necessarily need to wrap with `funs()`:
+
+```{r, eval = FALSE}
+mutate_all(starwars, as.character)
+```
+
+
+On the other hand, if you supplied a variable selection, you should
+use `mutate_at()`. The variable selection should be wrapped with
+`vars()`.
+
+```{r, eval = FALSE}
+mutate_each(starwars, funs(as.character), height, mass)
+mutate_at(starwars, vars(height, mass), as.character)
+```
+
+`vars()` supports all the selection helpers that you usually use with
+`select()`:
+
+```{r, eval = FALSE}
+summarise_at(mtcars, vars(starts_with("d")), mean)
+```
+
+Note that instead of a `vars()` selection, you can also supply
+character vectors of column names:
+
+```{r, eval = FALSE}
+mutate_at(starwars, c("height", "mass"), as.character)
+```
diff --git a/inst/doc/compatibility.html b/inst/doc/compatibility.html
new file mode 100644
index 0000000..75afe12
--- /dev/null
+++ b/inst/doc/compatibility.html
@@ -0,0 +1,243 @@
+<!DOCTYPE html>
+
+<html xmlns="http://www.w3.org/1999/xhtml">
+
+<head>
+
+<meta charset="utf-8" />
+<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
+<meta name="generator" content="pandoc" />
+
+<meta name="viewport" content="width=device-width, initial-scale=1">
+
+
+
+<title>dplyr compatibility</title>
+
+
+
+<style type="text/css">code{white-space: pre;}</style>
+<style type="text/css">
+div.sourceCode { overflow-x: auto; }
+table.sourceCode, tr.sourceCode, td.lineNumbers, td.sourceCode {
+ margin: 0; padding: 0; vertical-align: baseline; border: none; }
+table.sourceCode { width: 100%; line-height: 100%; }
+td.lineNumbers { text-align: right; padding-right: 4px; padding-left: 4px; color: #aaaaaa; border-right: 1px solid #aaaaaa; }
+td.sourceCode { padding-left: 5px; }
+code > span.kw { color: #007020; font-weight: bold; } /* Keyword */
+code > span.dt { color: #902000; } /* DataType */
+code > span.dv { color: #40a070; } /* DecVal */
+code > span.bn { color: #40a070; } /* BaseN */
+code > span.fl { color: #40a070; } /* Float */
+code > span.ch { color: #4070a0; } /* Char */
+code > span.st { color: #4070a0; } /* String */
+code > span.co { color: #60a0b0; font-style: italic; } /* Comment */
+code > span.ot { color: #007020; } /* Other */
+code > span.al { color: #ff0000; font-weight: bold; } /* Alert */
+code > span.fu { color: #06287e; } /* Function */
+code > span.er { color: #ff0000; font-weight: bold; } /* Error */
+code > span.wa { color: #60a0b0; font-weight: bold; font-style: italic; } /* Warning */
+code > span.cn { color: #880000; } /* Constant */
+code > span.sc { color: #4070a0; } /* SpecialChar */
+code > span.vs { color: #4070a0; } /* VerbatimString */
+code > span.ss { color: #bb6688; } /* SpecialString */
+code > span.im { } /* Import */
+code > span.va { color: #19177c; } /* Variable */
+code > span.cf { color: #007020; font-weight: bold; } /* ControlFlow */
+code > span.op { color: #666666; } /* Operator */
+code > span.bu { } /* BuiltIn */
+code > span.ex { } /* Extension */
+code > span.pp { color: #bc7a00; } /* Preprocessor */
+code > span.at { color: #7d9029; } /* Attribute */
+code > span.do { color: #ba2121; font-style: italic; } /* Documentation */
+code > span.an { color: #60a0b0; font-weight: bold; font-style: italic; } /* Annotation */
+code > span.cv { color: #60a0b0; font-weight: bold; font-style: italic; } /* CommentVar */
+code > span.in { color: #60a0b0; font-weight: bold; font-style: italic; } /* Information */
+</style>
+
+
+
+<link href="data:text/css;charset=utf-8,body%20%7B%0Abackground%2Dcolor%3A%20%23fff%3B%0Amargin%3A%201em%20auto%3B%0Amax%2Dwidth%3A%20700px%3B%0Aoverflow%3A%20visible%3B%0Apadding%2Dleft%3A%202em%3B%0Apadding%2Dright%3A%202em%3B%0Afont%2Dfamily%3A%20%22Open%20Sans%22%2C%20%22Helvetica%20Neue%22%2C%20Helvetica%2C%20Arial%2C%20sans%2Dserif%3B%0Afont%2Dsize%3A%2014px%3B%0Aline%2Dheight%3A%201%2E35%3B%0A%7D%0A%23header%20%7B%0Atext%2Dalign%3A%20center%3B%0A%7D%0A%23TOC%20%7B%0Aclear%3A%20bot [...]
+
+</head>
+
+<body>
+
+
+
+
+<h1 class="title toc-ignore">dplyr compatibility</h1>
+
+
+
+<p>This vignette is aimed at package authors who need to update their code because of a backward incompatible change to dplyr. We do try and minimise backward incompatible changes as much as possible, but sometimes they are necessary in order to radically simplify existing code, or unlock a lot of potential value in the future.</p>
+<p>This vignette starts with some general advice on writing package code that works with multiple versions of dplyr, then continues to discuss specific changes in dplyr versions.</p>
+<div id="working-with-multiple-dplyr-versions" class="section level2">
+<h2>Working with multiple dplyr versions</h2>
+<p>Ideally, you want to make sure that your package works with both the released version and the development version of dplyr. This is typically a little bit more work, but has two big advantages:</p>
+<ol style="list-style-type: decimal">
+<li><p>It’s more convenient for your users, since they’re not forced to update dplyr if they don’t want to.</p></li>
+<li><p>It’s easier on CRAN since it doesn’t require a massive coordinated release of multiple packages.</p></li>
+</ol>
+<p>To make code work with multiple versions of a package, your first tool is the simple if statement:</p>
+<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"><span class="cf">if</span> (utils<span class="op">::</span><span class="kw">packageVersion</span>(<span class="st">"dplyr"</span>) <span class="op">></span><span class="st"> "0.5.0"</span>) {
+ <span class="co"># code for new version</span>
+} <span class="cf">else</span> {
+ <span class="co"># code for old version</span>
+}</code></pre></div>
+<p>Always condition on <code>> current-version</code>, not <code>>= next-version</code> because this will ensure that this branch is also used for the development version of the package. For example, if the current release is version “0.5.0”, the development version will be “0.5.0.9000”.</p>
+<p>Occasionally, you’ll run into a situation where the <code>NAMESPACE</code> has changed and you need to conditionally import different functions. This typically occurs when functions are moved from one package to another. We try our best to provide automatic fallbacks, but this is not always possible. Often you can work around the problem by avoiding <code>importFrom</code> and using <code>::</code> instead. Do this where possible:</p>
+<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"><span class="cf">if</span> (utils<span class="op">::</span><span class="kw">packageVersion</span>(<span class="st">"dplyr"</span>) <span class="op">></span><span class="st"> "0.5.0"</span>) {
+ dbplyr<span class="op">::</span><span class="kw">build_sql</span>(...)
+} <span class="cf">else</span> {
+ dplyr<span class="op">::</span><span class="kw">build_sql</span>(...)
+}</code></pre></div>
+<p>This will generate an <code>R CMD check</code> NOTE (because one of the functions will always be missing), but this is ok. Simply explain that you get the note because you have written a wrapper to make sure your code is backward compatible.</p>
+<p>Sometimes it’s not possible to avoid <code>importFrom()</code>. For example you might be importing a generic so that you can define a method for it. In this case, you can take advantage of a little-known feature in the <code>NAMESPACE</code> file: you can include <code>if</code> statements.</p>
+<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"><span class="co">#' @rawNamespace</span>
+<span class="co">#' if (utils::packageVersion("dplyr") > "0.5.0") {</span>
+<span class="co">#' importFrom("dbplyr", "build_sql")</span>
+<span class="co">#' } else {</span>
+<span class="co">#' importFrom("dplyr", "build_sql")</span>
+<span class="co">#' }</span></code></pre></div>
+</div>
+<div id="dplyr-0.6.0" class="section level2">
+<h2>dplyr 0.6.0</h2>
+<div id="database-code-moves-to-dbplyr" class="section level3">
+<h3>Database code moves to dbplyr</h3>
+<p>Almost all database related code has been moved out of dplyr and into a new package, <a href="http://github.com/hadley/dbplyr/">dbplyr</a>. This makes dplyr simpler, and will make it easier to release fixes for bugs that only affect databases. If you’ve implemented a database backend for dplyr, please read the <a href="https://github.com/hadley/dbplyr/blob/master/NEWS.md#backends">backend news</a> on the backend.</p>
+<p>Depending on what generics you use, and what generics you provide methods for, you may need to write some conditional code. To help make this easier we’ve written <code>wrap_dbplyr_obj()</code> which will write the helper code for you:</p>
+<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"><span class="kw">wrap_dbplyr_obj</span>(<span class="st">"build_sql"</span>)
+
+<span class="kw">wrap_dbplyr_obj</span>(<span class="st">"base_agg"</span>)</code></pre></div>
+<p>Simply copy the results of this function in your package.</p>
+<p>These will generate <code>R CMD check</code> NOTES, so make sure to tell CRAN that this is to ensure backward compatibility.</p>
+</div>
+<div id="deprecation-of-underscored-verbs_" class="section level3">
+<h3>Deprecation of underscored <code>verbs_()</code></h3>
+<p>Because the tidyeval framework allows us to combine SE and NSE semantics within the same functions, the underscored verbs have been softly deprecated.</p>
+<div id="for-users-of-se_-verbs" class="section level4">
+<h4>For users of SE_ verbs</h4>
+<p>The legacy underscored versions take objects for which a <code>lazyeval::as.lazy()</code> method is defined. This includes symbols and calls, strings, and formulas. All of these objects have been replaced with quosures and you can call tidyeval verbs with unquoted quosures:</p>
+<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r">quo <-<span class="st"> </span><span class="kw">quo</span>(cyl)
+<span class="kw">select</span>(mtcars, <span class="op">!!</span><span class="st"> </span>quo)</code></pre></div>
+<p>Symbolic expressions are also supported, but note that bare symbols and calls do not carry scope information. If you’re referring to objects in the data frame, it’s safe to omit specifying an enclosure:</p>
+<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r">sym <-<span class="st"> </span><span class="kw">quote</span>(cyl)
+<span class="kw">select</span>(mtcars, <span class="op">!!</span><span class="st"> </span>sym)
+
+call <-<span class="st"> </span><span class="kw">quote</span>(<span class="kw">mean</span>(cyl))
+<span class="kw">summarise</span>(mtcars, <span class="op">!!</span><span class="st"> </span>call)</code></pre></div>
+<p>Transforming objects into quosures is generally straightforward. To enclose with the current environment, you can unquote directly in <code>quo()</code> or you can use <code>as_quosure()</code>:</p>
+<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"><span class="kw">quo</span>(<span class="op">!!</span><span class="st"> </span>sym)
+<span class="co">#> <quosure: global></span>
+<span class="co">#> ~cyl</span>
+<span class="kw">quo</span>(<span class="op">!!</span><span class="st"> </span>call)
+<span class="co">#> <quosure: global></span>
+<span class="co">#> ~mean(cyl)</span>
+
+rlang<span class="op">::</span><span class="kw">as_quosure</span>(sym)
+<span class="co">#> <quosure: global></span>
+<span class="co">#> ~cyl</span>
+rlang<span class="op">::</span><span class="kw">as_quosure</span>(call)
+<span class="co">#> <quosure: global></span>
+<span class="co">#> ~mean(cyl)</span></code></pre></div>
+<p>Note that while formulas and quosures are very similar objects (and in the most general sense, formulas are quosures), they can’t be used interchangeably in tidyeval functions. Early implementations did treat bare formulas as quosures, but this created compatibility issues with modelling functions of the stats package. Fortunately, it’s easy to transform formulas to quosures that will self-evaluate in tidyeval functions:</p>
+<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r">f <-<span class="st"> </span><span class="er">~</span>cyl
+f
+<span class="co">#> ~cyl</span>
+rlang<span class="op">::</span><span class="kw">as_quosure</span>(f)
+<span class="co">#> <quosure: global></span>
+<span class="co">#> ~cyl</span></code></pre></div>
+<p>Finally, and perhaps most importantly, <strong>strings are not and should not be parsed</strong>. As developers, it is tempting to try and solve problems using strings because we have been trained to work with strings rather than quoted expressions. However it’s almost always the wrong way to approach the problem. The exception is for creating symbols. In that case it is perfectly legitimate to use strings:</p>
+<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r">rlang<span class="op">::</span><span class="kw">sym</span>(<span class="st">"cyl"</span>)
+<span class="co">#> cyl</span>
+rlang<span class="op">::</span><span class="kw">syms</span>(letters[<span class="dv">1</span><span class="op">:</span><span class="dv">3</span>])
+<span class="co">#> [[1]]</span>
+<span class="co">#> a</span>
+<span class="co">#> </span>
+<span class="co">#> [[2]]</span>
+<span class="co">#> b</span>
+<span class="co">#> </span>
+<span class="co">#> [[3]]</span>
+<span class="co">#> c</span></code></pre></div>
+<p>But you should never use strings to create calls. Instead you can use quasiquotation:</p>
+<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r">syms <-<span class="st"> </span>rlang<span class="op">::</span><span class="kw">syms</span>(<span class="kw">c</span>(<span class="st">"foo"</span>, <span class="st">"bar"</span>, <span class="st">"baz"</span>))
+<span class="kw">quo</span>(<span class="kw">my_call</span>(<span class="op">!!!</span><span class="st"> </span>syms))
+<span class="co">#> <quosure: global></span>
+<span class="co">#> ~my_call(foo, bar, baz)</span>
+
+fun <-<span class="st"> </span>rlang<span class="op">::</span><span class="kw">sym</span>(<span class="st">"my_call"</span>)
+<span class="kw">quo</span>(<span class="kw">UQ</span>(fun)(<span class="op">!!!</span><span class="st"> </span>syms))
+<span class="co">#> <quosure: global></span>
+<span class="co">#> ~my_call(foo, bar, baz)</span></code></pre></div>
+<p>Or create the call with <code>lang()</code>:</p>
+<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r">call <-<span class="st"> </span>rlang<span class="op">::</span><span class="kw">lang</span>(<span class="st">"my_call"</span>, <span class="op">!!!</span><span class="st"> </span>syms)
+call
+<span class="co">#> my_call(foo, bar, baz)</span>
+
+rlang<span class="op">::</span><span class="kw">as_quosure</span>(call)
+<span class="co">#> <quosure: global></span>
+<span class="co">#> ~my_call(foo, bar, baz)</span>
+
+<span class="co"># Or equivalently:</span>
+<span class="kw">quo</span>(<span class="op">!!</span><span class="st"> </span>rlang<span class="op">::</span><span class="kw">lang</span>(<span class="st">"my_call"</span>, <span class="op">!!!</span><span class="st"> </span>syms))
+<span class="co">#> <quosure: global></span>
+<span class="co">#> ~my_call(foo, bar, baz)</span></code></pre></div>
+<p>Note that idioms based on <code>interp()</code> should now generally be avoided and replaced with quasiquotation. Where you used to interpolate:</p>
+<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r">lazyeval<span class="op">::</span><span class="kw">interp</span>(<span class="op">~</span><span class="st"> </span><span class="kw">mean</span>(var), <span class="dt">var =</span> rlang<span class="op">::</span><span class="kw">sym</span>(<span class="st">"mpg"</span>))</code></pre></div>
+<p>You would now unquote:</p>
+<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r">var <-<span class="st"> "mpg"</span>
+<span class="kw">quo</span>(<span class="kw">mean</span>(<span class="op">!!</span><span class="st"> </span>rlang<span class="op">::</span><span class="kw">sym</span>(var)))</code></pre></div>
+<p>See also <code>vignette("programming")</code> for more about quasiquotation and quosures.</p>
+</div>
+<div id="for-package-authors" class="section level4">
+<h4>For package authors</h4>
+<p>For package authors, rlang provides a <a href="https://github.com/hadley/rlang/blob/master/R/compat-lazyeval.R">compatibility file</a> that you can copy to your package. <code>compat_lazy()</code> and <code>compat_lazy_dots()</code> turn lazy-able objects into proper quosures. This helps providing an underscored version to your users for backward compatibility. For instance, here is how we defined the underscored version of <code>filter()</code> in dplyr 0.6:</p>
+<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r">filter_.tbl_df <-<span class="st"> </span><span class="cf">function</span>(.data, ..., <span class="dt">.dots =</span> <span class="kw">list</span>()) {
+ dots <-<span class="st"> </span><span class="kw">compat_lazy_dots</span>(.dots, <span class="kw">caller_env</span>(), ...)
+ <span class="kw">filter</span>(.data, <span class="op">!!!</span><span class="st"> </span>dots)
+}</code></pre></div>
+<p>With tidyeval, S3 dispatch to the correct method might be an issue. In the past, the genericity of dplyr verbs was accomplished by dispatching in the underscored versions. Now that those are deprecated, we’ve turned the non-underscored verbs into S3 generics.</p>
+<p>We maintain backward compatibility by redispatching to old underscored verbs in the default methods of the new S3 generics. For example, here is how we redispatch <code>filter()</code>:</p>
+<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r">filter.default <-<span class="st"> </span><span class="cf">function</span>(.data, ...) {
+ <span class="kw">filter_</span>(.data, <span class="dt">.dots =</span> <span class="kw">compat_as_lazy_dots</span>(...))
+}</code></pre></div>
+<p>This gets the job done in most cases. However, the default method will not be called for objects inheriting from one of the classes for which we provide non-underscored methods: <code>data.frame</code>, <code>tbl_df</code>, <code>tbl_cube</code> and <code>grouped_df</code>. An example of this is the <code>sf</code> package whose objects have classes <code>c("sf", "data.frame")</code>. Authors of such packages should provide a method for the non-underscored generic [...]
+<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r">filter.sf <-<span class="st"> </span><span class="cf">function</span>(.data, ...) {
+ <span class="kw">st_as_sf</span>(<span class="kw">NextMethod</span>())
+}</code></pre></div>
+<p>If you need help with this, please let us know!</p>
+</div>
+</div>
+<div id="deprecation-of-mutate_each-and-summarise_each" class="section level3">
+<h3>Deprecation of <code>mutate_each()</code> and <code>summarise_each()</code></h3>
+<p>These functions have been replaced by a more complete family of functions. This family has suffixes <code>_if</code>, <code>_at</code> and <code>_all</code> and includes more verbs than just <code>mutate</code> and <code>summarise</code>.</p>
+<p>If you need to update your code to the new family, there are two relevant functions depending on which variables you apply <code>funs()</code> to. If you called <code>mutate_each()</code> without supplying a selection of variables, <code>funs</code> is applied to all variables. In this case, you should update your code to use <code>mutate_all()</code> instead:</p>
+<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"><span class="kw">mutate_each</span>(starwars, <span class="kw">funs</span>(as.character))
+<span class="kw">mutate_all</span>(starwars, <span class="kw">funs</span>(as.character))</code></pre></div>
+<p>Note that the new verbs support bare functions as well, so you don’t necessarily need to wrap with <code>funs()</code>:</p>
+<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"><span class="kw">mutate_all</span>(starwars, as.character)</code></pre></div>
+<p>On the other hand, if you supplied a variable selection, you should use <code>mutate_at()</code>. The variable selection should be wrapped with <code>vars()</code>.</p>
+<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"><span class="kw">mutate_each</span>(starwars, <span class="kw">funs</span>(as.character), height, mass)
+<span class="kw">mutate_at</span>(starwars, <span class="kw">vars</span>(height, mass), as.character)</code></pre></div>
+<p><code>vars()</code> supports all the selection helpers that you usually use with <code>select()</code>:</p>
+<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"><span class="kw">summarise_at</span>(mtcars, <span class="kw">vars</span>(<span class="kw">starts_with</span>(<span class="st">"d"</span>)), mean)</code></pre></div>
+<p>Note that instead of a <code>vars()</code> selection, you can also supply character vectors of column names:</p>
+<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"><span class="kw">mutate_at</span>(starwars, <span class="kw">c</span>(<span class="st">"height"</span>, <span class="st">"mass"</span>), as.character)</code></pre></div>
+</div>
+</div>
+
+
+
+<!-- dynamically load mathjax for compatibility with self-contained -->
+<script>
+ (function () {
+ var script = document.createElement("script");
+ script.type = "text/javascript";
+ script.src = "https://mathjax.rstudio.com/latest/MathJax.js?config=TeX-AMS-MML_HTMLorMML";
+ document.getElementsByTagName("head")[0].appendChild(script);
+ })();
+</script>
+
+</body>
+</html>
diff --git a/inst/doc/data_frames.R b/inst/doc/data_frames.R
deleted file mode 100644
index 4e7b3a3..0000000
--- a/inst/doc/data_frames.R
+++ /dev/null
@@ -1,23 +0,0 @@
-## ---- echo = FALSE, message = FALSE--------------------------------------
-knitr::opts_chunk$set(collapse = T, comment = "#>")
-options(tibble.print_min = 4L, tibble.print_max = 4L)
-library(dplyr)
-
-## ------------------------------------------------------------------------
-location(iris)
-
-## ------------------------------------------------------------------------
-iris2 <- iris
-location(iris2)
-
-## ------------------------------------------------------------------------
-changes(iris2, iris)
-
-## ------------------------------------------------------------------------
-iris2$Sepal.Length <- iris2$Sepal.Length * 2
-changes(iris, iris2)
-
-## ------------------------------------------------------------------------
-iris3 <- mutate(iris, Sepal.Length = Sepal.Length * 2)
-changes(iris3, iris)
-
diff --git a/inst/doc/data_frames.Rmd b/inst/doc/data_frames.Rmd
deleted file mode 100644
index a5303c5..0000000
--- a/inst/doc/data_frames.Rmd
+++ /dev/null
@@ -1,79 +0,0 @@
----
-title: "Data frame performance"
-date: "`r Sys.Date()`"
-output: rmarkdown::html_vignette
-vignette: >
- %\VignetteIndexEntry{Data frame performance}
- %\VignetteEngine{knitr::rmarkdown}
- \usepackage[utf8]{inputenc}
----
-
-```{r, echo = FALSE, message = FALSE}
-knitr::opts_chunk$set(collapse = T, comment = "#>")
-options(tibble.print_min = 4L, tibble.print_max = 4L)
-library(dplyr)
-```
-
-One of the reasons that dplyr is fast is that it's very careful about when to make copies. This section describes how this works, and gives you some useful tools for understanding the memory usage of data frames in R.
-
-The first tool we'll use is `dplyr::location()`. It tells us the memory location of three components of a data frame object:
-
-* the data frame itself
-* each column
-* each attribute
-
-```{r}
-location(iris)
-```
-
-It's useful to know the memory address, because if the address changes, then you'll know that R has made a copy. Copies are bad because they take time to create. This isn't usually a bottleneck if you have a few thousand values, but if you have millions or tens of millions of values it starts to take significant amounts of time. Unnecessary copies are also bad because they take up memory.
-
-R tries to avoid making copies where possible. For example, if you just assign `iris` to another variable, it continues to the point same location:
-
-```{r}
-iris2 <- iris
-location(iris2)
-```
-
-Rather than having to compare hard to read memory locations, we can instead use the `dplyr::changes()` function to highlights changes between two versions of a data frame. The code below shows us that `iris` and `iris2` are identical: both names point to the same location in memory.
-
-```{r}
-changes(iris2, iris)
-```
-
-What do you think happens if you modify a single column of `iris2`? In R 3.1.0 and above, R knows to modify only that one column and to leave the others pointing to their existing locations:
-
-```{r}
-iris2$Sepal.Length <- iris2$Sepal.Length * 2
-changes(iris, iris2)
-```
-
-(This was not the case prior to version 3.1.0, where R created a deep copy of the entire data frame.)
-
-dplyr is equally smart:
-
-```{r}
-iris3 <- mutate(iris, Sepal.Length = Sepal.Length * 2)
-changes(iris3, iris)
-```
-
-It creates only one new column while all the other columns continue to point at their original locations. You might notice that the attributes are still copied. However, this has little impact on performance. Because attributes are usually short vectors, the internal dplyr code needed to copy them is also considerably simpler.
-
-dplyr never makes copies unless it has to:
-
-* `tbl_df()` and `group_by()` don't copy columns
-
-* `select()` never copies columns, even when you rename them
-
-* `mutate()` never copies columns, except when you modify an existing column
-
-* `arrange()` must always copy all columns because you're changing the order of every one.
- This is an expensive operation for big data, but you can generally avoid
- it using the order argument to [window functions](window-functions.html)
-
-* `summarise()` creates new data, but it's usually at least an order of
- magnitude smaller than the original data.
-
-In short, dplyr lets you work with data frames with very little memory overhead.
-
-data.table takes this idea one step further: it provides functions that modify a data table in place. This avoids the need to make copies of pointers to existing columns and attributes, and speeds up operations when you have many columns. dplyr doesn't do this with data frames (although it could) because I think it's safer to keep data immutable: even if the resulting data frame shares practically all the data of the original data frame, all dplyr data frame methods return a new data frame.
diff --git a/inst/doc/databases.R b/inst/doc/databases.R
deleted file mode 100644
index c3a1039..0000000
--- a/inst/doc/databases.R
+++ /dev/null
@@ -1,98 +0,0 @@
-## ---- echo = FALSE, message = FALSE--------------------------------------
-knitr::opts_chunk$set(collapse = T, comment = "#>")
-options(tibble.print_min = 4L, tibble.print_max = 4L)
-library(dplyr)
-
-## ---- eval = FALSE-------------------------------------------------------
-# my_db <- src_sqlite("my_db.sqlite3", create = T)
-
-## ---- eval = FALSE-------------------------------------------------------
-# library(nycflights13)
-# flights_sqlite <- copy_to(my_db, flights, temporary = FALSE, indexes = list(
-# c("year", "month", "day"), "carrier", "tailnum"))
-
-## ------------------------------------------------------------------------
-flights_sqlite <- tbl(nycflights13_sqlite(), "flights")
-flights_sqlite
-
-## ---- eval = FALSE-------------------------------------------------------
-# tbl(my_db, sql("SELECT * FROM flights"))
-
-## ------------------------------------------------------------------------
-select(flights_sqlite, year:day, dep_delay, arr_delay)
-filter(flights_sqlite, dep_delay > 240)
-arrange(flights_sqlite, year, month, day)
-mutate(flights_sqlite, speed = air_time / distance)
-summarise(flights_sqlite, delay = mean(dep_time))
-
-## ------------------------------------------------------------------------
-c1 <- filter(flights_sqlite, year == 2013, month == 1, day == 1)
-c2 <- select(c1, year, month, day, carrier, dep_delay, air_time, distance)
-c3 <- mutate(c2, speed = distance / air_time * 60)
-c4 <- arrange(c3, year, month, day, carrier)
-
-## ------------------------------------------------------------------------
-c4
-
-## ------------------------------------------------------------------------
-collect(c4)
-
-## ------------------------------------------------------------------------
-c4$query
-
-## ------------------------------------------------------------------------
-explain(c4)
-
-## ------------------------------------------------------------------------
-# In SQLite variable names are escaped by double quotes:
-translate_sql(x)
-# And strings are escaped by single quotes
-translate_sql("x")
-
-# Many functions have slightly different names
-translate_sql(x == 1 && (y < 2 || z > 3))
-translate_sql(x ^ 2 < 10)
-translate_sql(x %% 2 == 10)
-
-# R and SQL have different defaults for integers and reals.
-# In R, 1 is a real, and 1L is an integer
-# In SQL, 1 is an integer, and 1.0 is a real
-translate_sql(1)
-translate_sql(1L)
-
-## ---- eval = FALSE-------------------------------------------------------
-# translate_sql(mean(x, trim = T))
-# # Error: Invalid number of args to SQL AVG. Expecting 1
-
-## ------------------------------------------------------------------------
-translate_sql(glob(x, y))
-translate_sql(x %like% "ab*")
-
-## ------------------------------------------------------------------------
-by_tailnum <- group_by(flights_sqlite, tailnum)
-delay <- summarise(by_tailnum,
- count = n(),
- dist = mean(distance),
- delay = mean(arr_delay)
-)
-delay <- filter(delay, count > 20, dist < 2000)
-delay_local <- collect(delay)
-
-## ---- eval = FALSE-------------------------------------------------------
-# flights_postgres <- tbl(src_postgres("nycflights13"), "flights")
-
-## ---- eval = FALSE-------------------------------------------------------
-# daily <- group_by(flights_postgres, year, month, day)
-#
-# # Find the most and least delayed flight each day
-# bestworst <- daily %>%
-# select(flight, arr_delay) %>%
-# filter(arr_delay == min(arr_delay) || arr_delay == max(arr_delay))
-# bestworst$query
-#
-# # Rank each flight within a daily
-# ranked <- daily %>%
-# select(arr_delay) %>%
-# mutate(rank = rank(desc(arr_delay)))
-# ranked$query
-
diff --git a/inst/doc/databases.Rmd b/inst/doc/databases.Rmd
deleted file mode 100644
index 12cd4ba..0000000
--- a/inst/doc/databases.Rmd
+++ /dev/null
@@ -1,257 +0,0 @@
----
-title: "Databases"
-date: "`r Sys.Date()`"
-output: rmarkdown::html_vignette
-vignette: >
- %\VignetteIndexEntry{Databases}
- %\VignetteEngine{knitr::rmarkdown}
- \usepackage[utf8]{inputenc}
----
-
-```{r, echo = FALSE, message = FALSE}
-knitr::opts_chunk$set(collapse = T, comment = "#>")
-options(tibble.print_min = 4L, tibble.print_max = 4L)
-library(dplyr)
-```
-
-As well as working with local in-memory data like data frames and data tables, dplyr also works with remote on-disk data stored in databases. Generally, if your data fits in memory there is no advantage to putting it in a database: it will only be slower and more hassle. The reason you'd want to use dplyr with a database is because either your data is already in a database (and you don't want to work with static csv files that someone else has dumped out for you), or you have so much dat [...]
-
-Since R almost exclusively works with in-memory data, if you do have a lot of data in a database, you can't just dump it into R. Instead, you'll have to work with subsets or aggregates. dplyr aims to make this task as easy as possible. If you're working with large data, it's also likely that you'll need support to get the data into the database and to ensure you have the right indices for good performance. While dplyr provides some simple tools to help with these tasks, they are no subst [...]
-
-The motivation for supporting databases in dplyr is that you never pull down the right subset or aggregate from the database on your first try. Usually you have to iterate between R and SQL many times before you get the perfect dataset. But because switching between languages is cognitively challenging (especially because R and SQL are so perilously similar), dplyr helps you by allowing you to write R code that is automatically translated to SQL. The goal of dplyr is not to replace every [...]
-
-To get the most out of this chapter, you'll need to be familiar with querying SQL databases using the `SELECT` statement. If you have some familiarity with SQL and you'd like to learn more, I found [how indexes work in SQLite](http://www.sqlite.org/queryplanner.html) and [10 easy steps to a complete understanding of SQL](http://blog.jooq.org/2016/03/17/10-easy-steps-to-a-complete-understanding-of-sql) to be particularly helpful.
-
-## Getting started
-
-The easiest way to experiement with databases using dplyr is to use SQLite. This is because everything you need is already included in the R package. You won't need to install anything, and you won't need to deal with the hassle of setting up a database server. Doing so is really easy: just give the path and the ok to create a table.
-
-```{r, eval = FALSE}
-my_db <- src_sqlite("my_db.sqlite3", create = T)
-```
-
-The main new concept here is the `src`, which is a collection of types of database tables. Use `src_sqlite()`, `src_mysql()`, `src_postgres()` and `src_bigquery()` to connect to the specific types supported by dplyr.
-
-`my_db` currently has no data in it, so we'll import the `flights` data using the convenient `copy_to()` function. This is a quick and dirty way of getting data into a database. Because all the data has to flow through R, you should note that this is not suitable for very large datasets.
-
-```{r, eval = FALSE}
-library(nycflights13)
-flights_sqlite <- copy_to(my_db, flights, temporary = FALSE, indexes = list(
- c("year", "month", "day"), "carrier", "tailnum"))
-```
-
-As you can see, the `copy_to()` operation has an additional argument that allows you to supply indexes for the table. Here we set up indexes that will allow us to quickly process the data by day, by carrier and by plane. `copy_to()` also executes the SQL `ANALYZE` command: this ensures that the database has up-to-date table statistics and performs the appropriate query optimisations.
-
-For this particular dataset, there's a built-in `src` that will cache `flights` in a standard location:
-
-```{r}
-flights_sqlite <- tbl(nycflights13_sqlite(), "flights")
-flights_sqlite
-```
-
-You can also use arbitrary SQL:
-
-```{r, eval = FALSE}
-tbl(my_db, sql("SELECT * FROM flights"))
-```
-
-## Basic verbs
-
-Whether you're dealing with remote or local data sources, you use the same five verbs:
-
-```{r}
-select(flights_sqlite, year:day, dep_delay, arr_delay)
-filter(flights_sqlite, dep_delay > 240)
-arrange(flights_sqlite, year, month, day)
-mutate(flights_sqlite, speed = air_time / distance)
-summarise(flights_sqlite, delay = mean(dep_time))
-```
-
-The most important difference is that the expressions in `select()`, `filter()`, `arrange()`, `mutate()`, and `summarise()` are translated into SQL so they can be run on the database. While the translations for the most common operations are almost perfect there are some limitations, which you'll learn about later.
-
-## Laziness
-
-When working with databases, dplyr tries to be as lazy as possible:
-
-* It never pulls data into R unless you explicitly ask for it.
-
-* It delays doing any work until the last possible moment: it collects together
- everything you want to do and then sends it to the database in one step.
-
-For example, take the following code:
-
-```{r}
-c1 <- filter(flights_sqlite, year == 2013, month == 1, day == 1)
-c2 <- select(c1, year, month, day, carrier, dep_delay, air_time, distance)
-c3 <- mutate(c2, speed = distance / air_time * 60)
-c4 <- arrange(c3, year, month, day, carrier)
-```
-
-Suprisingly, this sequence of operations never actually touches the database. It's not until you ask for the data (e.g. by printing `c4`) that dplyr generates the SQL and requests the results from the database. Even then it only pulls down 10 rows.
-
-```{r}
-c4
-```
-
-To pull down all the results use `collect()`, which returns a `tbl_df()`:
-
-```{r}
-collect(c4)
-```
-
-You can see the query dplyr has generated by looking at the `query` component of the object:
-
-```{r}
-c4$query
-```
-
-You can also ask the database how it plans to execute the query with `explain()`. The output for SQLite is described in more detail on the [SQLite website](http://www.sqlite.org/eqp.html). It's helpful if you're trying to figure out which indexes are being used.
-
-```{r}
-explain(c4)
-```
-
-### Forcing computation
-
-There are three ways to force the computation of a query:
-
-* `collect()` executes the query and returns the results to R.
-
-* `compute()` executes the query and stores the results in a temporary table
- in the database.
-
-* `collapse()` turns the query into a table expression.
-
-`collect()` is the function you'll use most. Once you reach the set of operations you want, you use collect() to pull the data into a local `tbl_df()`. If you know SQL, you can use `compute()` and `collapse()` to optimise performance.
-
-### Performance considerations
-
-dplyr tries to prevent you from accidentally performing expensive query operations:
-
-* Because there's generally no way to determine how many rows a query will return unless
- you actually run it, `nrow()` is always `NA`.
-
-* Printing a tbl only runs the query for the first 10 rows.
-
-* Because you can't find the last few rows without executing the whole query, you can't use `tail()`.
-
-## SQL translation
-
-When performing the simple mathematical operations used when filtering, mutating or summarising, translating R code to SQL (or indeed to any programming language) is relatively straightforward.
-
-To experiment with this, use `translate_sql()`. The following examples work through some of the basic differences between R and SQL.
-
-```{r}
-# In SQLite variable names are escaped by double quotes:
-translate_sql(x)
-# And strings are escaped by single quotes
-translate_sql("x")
-
-# Many functions have slightly different names
-translate_sql(x == 1 && (y < 2 || z > 3))
-translate_sql(x ^ 2 < 10)
-translate_sql(x %% 2 == 10)
-
-# R and SQL have different defaults for integers and reals.
-# In R, 1 is a real, and 1L is an integer
-# In SQL, 1 is an integer, and 1.0 is a real
-translate_sql(1)
-translate_sql(1L)
-```
-
-dplyr knows how to convert the following R functions to SQL:
-
-* basic math operators: `+`, `-`, `*`, `/`, `%%`, `^`
-* math functions: `abs`, `acos`, `acosh`, `asin`, `asinh`, `atan`, `atan2`,
- `atanh`, `ceiling`, `cos`, `cosh`, `cot`, `coth`, `exp`, `floor`,
- `log`, `log10`, `round`, `sign`, `sin`, `sinh`, `sqrt`, `tan`, `tanh`
-* logical comparisons: `<`, `<=`, `!=`, `>=`, `>`, `==`, `%in%`
-* boolean operations: `&`, `&&`, `|`, `||`, `!`, `xor`
-* basic aggregations: `mean`, `sum`, `min`, `max`, `sd`, `var`
-
-The basic techniques that underlie the implementation of `translate_sql()` are described in the [Advanced R book](http://adv-r.had.co.nz/dsl.html). `translate_sql()` is built on top of R's parsing engine and has been carefully designed to generate correct SQL. It also protects you against SQL injection attacks by correctly escaping the strings and variable names needed by the database that you're connecting to.
-
-Perfect translation is not possible because databases don't have all the functions that R does. The goal of dplyr is to provide a semantic rather than a literal translation: what you mean rather than what is done. In fact, even for functions that exist both in databases and R, you shouldn't expect results to be identical; database programmers have different priorities than R core programmers. For example, in R in order to get a higher level of numerical accuracy, `mean()` loops through t [...]
-
-```{r, eval = FALSE}
-translate_sql(mean(x, trim = T))
-# Error: Invalid number of args to SQL AVG. Expecting 1
-```
-
-Any function that dplyr doesn't know how to convert is left as is. This means that database functions that are not covered by dplyr can be used directly via `translate_sql()`. Here a couple of examples that will work with [SQLite](http://www.sqlite.org/lang_corefunc.html):
-
-```{r}
-translate_sql(glob(x, y))
-translate_sql(x %like% "ab*")
-```
-
-## Grouping
-
-SQLite lacks the window functions that are needed for grouped mutation and filtering. This means that the only really useful operations for grouped SQLite tables are found in `summarise()`. The grouped summarise from the introduction translates well - the only difference is that databases always drop NULLs (their equivalent of missing values), so we don't supply `na.rm = TRUE`.
-
-```{r}
-by_tailnum <- group_by(flights_sqlite, tailnum)
-delay <- summarise(by_tailnum,
- count = n(),
- dist = mean(distance),
- delay = mean(arr_delay)
-)
-delay <- filter(delay, count > 20, dist < 2000)
-delay_local <- collect(delay)
-```
-
-Other databases do support window functions. You can learn about them in the corresponding vignette. It's sometimes possible to simulate grouped filtering and mutation using self joins, which join the original table with a summarised version, but that topic is beyond the scope of this introduction.
-
-## Other databases
-
-Aside from SQLite, the overall workflow is essentially the same regardless of the database you're connecting to. The following sections go in to more details about the peculiarities of each database engine. All of these databases follow a client-server model - a computer that connects to the database and the computer that is running the database (the two may be one and the same but usually isn't). Getting one of these databases up and running is beyond the scope of this article, but ther [...]
-
-### PostgreSQL
-
-`src_postgres()` has five arguments: `dbname`, `host`, `port`, `user` and `password`. If you are running a local postgresql database with the default settings you only need `dbname`. But in most cases, you'll need all five. dplyr uses the RPostgreSQL package to connect to postgres databases. This means that you can't currently connect to remote databases that require a SSL connection (e.g. Heroku).
-
-For example, the following code allows me to connect to a local PostgreSQL database that contains a copy of the `flights` data:
-
-```{r, eval = FALSE}
-flights_postgres <- tbl(src_postgres("nycflights13"), "flights")
-```
-
-PostgreSQL is a considerably more powerful database than SQLite. It has:
-
-* a much wider range of [built-in functions](http://www.postgresql.org/docs/9.3/static/functions.html)
-
-* support for [window functions](http://www.postgresql.org/docs/9.3/static/tutorial-window.html), which allow grouped subset and mutates to work.
-
-The following examples show how we can perform grouped filter and mutate operations with PostgreSQL. Because you can't filter on window functions directly, the SQL generated from the grouped filter is quite complex; so they instead have to go in a subquery.
-
-```{r, eval = FALSE}
-daily <- group_by(flights_postgres, year, month, day)
-
-# Find the most and least delayed flight each day
-bestworst <- daily %>%
- select(flight, arr_delay) %>%
- filter(arr_delay == min(arr_delay) || arr_delay == max(arr_delay))
-bestworst$query
-
-# Rank each flight within a daily
-ranked <- daily %>%
- select(arr_delay) %>%
- mutate(rank = rank(desc(arr_delay)))
-ranked$query
-```
-
-### MySQL and MariaDB
-
-You can connect to MySQL and MariaDB (a recent fork of MySQL) using `src_mysql()`, mediated by the [RMySQL](https://github.com/jeffreyhorner/RMySQL) package. Like PostgreSQL, you'll need to provide a `dbname`, `username`, `password`, `host`, and `port`.
-
-In terms of functionality, MySQL lies somewhere between SQLite and PostgreSQL. It provides a wider range of [built-in functions](http://dev.mysql.com/doc/refman/5.0/en/functions.html), but it does not support window functions (so you can't do grouped mutates and filters).
-
-### BigQuery
-
-BigQuery is a hosted database server provided by Google. To connect, you need to provide your `project`, `dataset` and optionally a project for `billing` (if billing for `project` isn't enabled). After you create the src, your web browser will open and ask you to authenticate. Your credentials are stored in a local cache, so you should only need to do this once.
-
-BigQuery supports only one SQL statement: [SELECT](https://developers.google.com/bigquery/query-reference). Fortunately this is all you need for data analysis. Within SELECT, BigQuery provides comprehensive coverage at a similar level to PostgreSQL.
-
-## Picking a database
-
-If you don't already have a database, here's some advice from my experiences setting up and running all of them. SQLite is by far the easiest to get started with, but the lack of window functions makes it limited for data analysis. PostgreSQL is not too much harder to use and has a wide range of built-in functions. Don't bother with MySQL/MariaDB: it's a pain to set up and the documentation is subpar. Google BigQuery might be a good fit if you have very large data, or if you're willing t [...]
diff --git a/inst/doc/introduction.R b/inst/doc/dplyr.R
similarity index 72%
rename from inst/doc/introduction.R
rename to inst/doc/dplyr.R
index 4931232..6a42a7d 100644
--- a/inst/doc/introduction.R
+++ b/inst/doc/dplyr.R
@@ -3,11 +3,12 @@ knitr::opts_chunk$set(collapse = T, comment = "#>")
options(tibble.print_min = 4L, tibble.print_max = 4L)
library(dplyr)
library(ggplot2)
+set.seed(1014)
## ------------------------------------------------------------------------
library(nycflights13)
dim(flights)
-head(flights)
+flights
## ------------------------------------------------------------------------
filter(flights, month == 1, day == 1)
@@ -15,22 +16,12 @@ filter(flights, month == 1, day == 1)
## ---- eval = FALSE-------------------------------------------------------
# flights[flights$month == 1 & flights$day == 1, ]
-## ---- eval = FALSE-------------------------------------------------------
-# filter(flights, month == 1 | month == 2)
-
-## ------------------------------------------------------------------------
-slice(flights, 1:10)
-
## ------------------------------------------------------------------------
arrange(flights, year, month, day)
## ------------------------------------------------------------------------
arrange(flights, desc(arr_delay))
-## ---- eval = FALSE-------------------------------------------------------
-# flights[order(flights$year, flights$month, flights$day), ]
-# flights[order(flights$arr_delay, decreasing = TRUE), ] or flights[order(-flights$arr_delay), ]
-
## ------------------------------------------------------------------------
# Select columns by name
select(flights, year, month, day)
@@ -46,13 +37,10 @@ select(flights, tail_num = tailnum)
rename(flights, tail_num = tailnum)
## ------------------------------------------------------------------------
-distinct(flights, tailnum)
-distinct(flights, origin, dest)
-
-## ------------------------------------------------------------------------
mutate(flights,
gain = arr_delay - dep_delay,
- speed = distance / air_time * 60)
+ speed = distance / air_time * 60
+)
## ------------------------------------------------------------------------
mutate(flights,
@@ -60,13 +48,6 @@ mutate(flights,
gain_per_hour = gain / (air_time / 60)
)
-## ---- eval = FALSE-------------------------------------------------------
-# transform(flights,
-# gain = arr_delay - delay,
-# gain_per_hour = gain / (air_time / 60)
-# )
-# #> Error: object 'gain' not found
-
## ------------------------------------------------------------------------
transmute(flights,
gain = arr_delay - dep_delay,
@@ -75,7 +56,8 @@ transmute(flights,
## ------------------------------------------------------------------------
summarise(flights,
- delay = mean(dep_delay, na.rm = TRUE))
+ delay = mean(dep_delay, na.rm = TRUE)
+)
## ------------------------------------------------------------------------
sample_n(flights, 10)
@@ -109,6 +91,56 @@ daily <- group_by(flights, year, month, day)
(per_month <- summarise(per_day, flights = sum(flights)))
(per_year <- summarise(per_month, flights = sum(flights)))
+## ------------------------------------------------------------------------
+# `year` represents the integer 1
+select(flights, year)
+select(flights, 1)
+
+## ------------------------------------------------------------------------
+year <- "dep"
+select(flights, starts_with(year))
+
+## ------------------------------------------------------------------------
+year <- 5
+select(flights, year, identity(year))
+
+## ------------------------------------------------------------------------
+vars <- c("year", "month")
+select(flights, vars, "day")
+
+## ------------------------------------------------------------------------
+# Let's create a new `vars` column:
+flights$vars <- flights$year
+
+# The new column won't be an issue if you evaluate `vars` in the
+# context with the `!!` operator:
+vars <- c("year", "month", "day")
+select(flights, !! vars)
+
+## ------------------------------------------------------------------------
+df <- select(flights, year:dep_time)
+
+## ------------------------------------------------------------------------
+mutate(df, "year", 2)
+
+## ------------------------------------------------------------------------
+mutate(df, year + 10)
+
+## ------------------------------------------------------------------------
+var <- seq(1, nrow(df))
+mutate(df, new = var)
+
+## ------------------------------------------------------------------------
+group_by(df, month)
+group_by(df, month = as.factor(month))
+group_by(df, day_binned = cut(day, 3))
+
+## ------------------------------------------------------------------------
+group_by(df, "month")
+
+## ------------------------------------------------------------------------
+group_by_at(df, vars(year:day))
+
## ---- eval = FALSE-------------------------------------------------------
# a1 <- group_by(flights, year, month, day)
# a2 <- select(a1, arr_delay, dep_delay)
diff --git a/vignettes/introduction.Rmd b/inst/doc/dplyr.Rmd
similarity index 54%
rename from vignettes/introduction.Rmd
rename to inst/doc/dplyr.Rmd
index 354320d..346e114 100644
--- a/vignettes/introduction.Rmd
+++ b/inst/doc/dplyr.Rmd
@@ -1,6 +1,5 @@
---
title: "Introduction to dplyr"
-date: "`r Sys.Date()`"
output: rmarkdown::html_vignette
vignette: >
%\VignetteIndexEntry{Introduction to dplyr}
@@ -13,6 +12,7 @@ knitr::opts_chunk$set(collapse = T, comment = "#>")
options(tibble.print_min = 4L, tibble.print_max = 4L)
library(dplyr)
library(ggplot2)
+set.seed(1014)
```
When working with data you must:
@@ -25,53 +25,42 @@ When working with data you must:
The dplyr package makes these steps fast and easy:
-* By constraining your options, it simplifies how you can think about common data manipulation tasks.
+* By constraining your options, it helps you think about your data manipulation
+ challenges.
-* It provides simple "verbs", functions that correspond to the most common data manipulation tasks, to help you translate those thoughts into code.
+* It provides simple "verbs", functions that correspond to the most common data
+ manipulation tasks, to help you translate your thoughts into code.
-* It uses efficient data storage backends, so you spend less time waiting for the computer.
+* It uses efficient backends, so you spend less time waiting for the computer.
-This document introduces you to dplyr's basic set of tools, and shows you how to apply them to data frames. Other vignettes provide more details on specific topics:
-
-* databases: Besides in-memory data frames, dplyr also connects to out-of-memory, remote databases. And by translating your R code into the appropriate SQL, it allows you to work with both types of data using the same set of tools.
-
-* benchmark-baseball: see how dplyr compares to other tools for data
- manipulation on a realistic use case.
-
-* window-functions: a window function is a variation on an aggregation
- function. Where an aggregate function uses `n` inputs to produce 1
- output, a window function uses `n` inputs to produce `n` outputs.
+This document introduces you to dplyr's basic set of tools, and shows you how to apply them to data frames. dplyr also supports databases via the dbplyr package, once you've installed, read `vignette("dbplyr")` to learn more.
## Data: nycflights13
-To explore the basic data manipulation verbs of dplyr, we'll start with the built in
-`nycflights13` data frame. This dataset contains all `r nrow(nycflights13::flights)` flights that departed from New York City in 2013. The data comes from the US [Bureau of Transportation Statistics](http://www.transtats.bts.gov/DatabaseInfo.asp?DB_ID=120&Link=0), and is documented in `?nycflights13`
+To explore the basic data manipulation verbs of dplyr, we'll use `nycflights13::flights`. This dataset contains all `r nrow(nycflights13::flights)` flights that departed from New York City in 2013. The data comes from the US [Bureau of Transportation Statistics](http://www.transtats.bts.gov/DatabaseInfo.asp?DB_ID=120&Link=0), and is documented in `?nycflights13`
```{r}
library(nycflights13)
dim(flights)
-head(flights)
+flights
```
-dplyr can work with data frames as is, but if you're dealing with large data, it's worthwhile to convert them to a `tbl_df`: this is a wrapper around a data frame that won't accidentally print a lot of data to the screen.
+Note that `nycflights13::flights` is a tibble, a modern reimagining of the data frame. It's particular useful for large datasets because it only prints the first few rows. You can learn more about tibbles at <http://tibble.tidyverse.org>; in particular you can convert data frames to tibbles with `as_tibble()`.
## Single table verbs
Dplyr aims to provide a function for each basic verb of data manipulation:
-* `filter()` (and `slice()`)
-* `arrange()`
-* `select()` (and `rename()`)
-* `distinct()`
-* `mutate()` (and `transmute()`)
-* `summarise()`
-* `sample_n()` (and `sample_frac()`)
-
-If you've used plyr before, many of these will be familar.
+* `filter()` to select cases based on their values.
+* `arrange()` to reorder the cases.
+* `select()` and `rename()` to select variables based on their names.
+* `mutate()` and `transmute()` to add new variables that are functions of existing variables.
+* `summarise()` to condense multiple values to a single value.
+* `sample_n()` and `sample_frac()` to take random samples.
-## Filter rows with `filter()`
+### Filter rows with `filter()`
-`filter()` allows you to select a subset of rows in a data frame. The first argument is the name of the data frame. The second and subsequent arguments are the expressions that filter the data frame:
+`filter()` allows you to select a subset of rows in a data frame. Like all single verbs, the first argument is the tibble (or data frame). The second and subsequent arguments refer to variables within that data frame, selecting rows where the expression is `TRUE`.
For example, we can select all flights on January 1st with:
@@ -79,25 +68,13 @@ For example, we can select all flights on January 1st with:
filter(flights, month == 1, day == 1)
```
-This is equivalent to the more verbose code in base R:
+This is rougly equivalent to this base R code:
```{r, eval = FALSE}
flights[flights$month == 1 & flights$day == 1, ]
```
-`filter()` works similarly to `subset()` except that you can give it any number of filtering conditions, which are joined together with `&` (not `&&` which is easy to do accidentally!). You can also use other boolean operators:
-
-```{r, eval = FALSE}
-filter(flights, month == 1 | month == 2)
-```
-
-To select rows by position, use `slice()`:
-
-```{r}
-slice(flights, 1:10)
-```
-
-## Arrange rows with `arrange()`
+### Arrange rows with `arrange()`
`arrange()` works similarly to `filter()` except that instead of filtering or selecting rows, it reorders them. It takes a data frame, and a set of column names (or more complicated expressions) to order by. If you provide more than one column name, each additional column will be used to break ties in the values of preceding columns:
@@ -111,14 +88,7 @@ Use `desc()` to order a column in descending order:
arrange(flights, desc(arr_delay))
```
-`dplyr::arrange()` works the same way as `plyr::arrange()`. It's a straightforward wrapper around `order()` that requires less typing. The previous code is equivalent to:
-
-```{r, eval = FALSE}
-flights[order(flights$year, flights$month, flights$day), ]
-flights[order(flights$arr_delay, decreasing = TRUE), ] or flights[order(-flights$arr_delay), ]
-```
-
-## Select columns with `select()`
+### Select columns with `select()`
Often you work with large datasets with many columns but only a few are actually of interest to you. `select()` allows you to rapidly zoom in on a useful subset using operations that usually only work on numeric variable positions:
@@ -131,8 +101,6 @@ select(flights, year:day)
select(flights, -(year:day))
```
-This function works similarly to the `select` argument in `base::subset()`. Because the dplyr philosophy is to have small functions that do one thing well, it's its own function in dplyr.
-
There are a number of helper functions you can use within `select()`, like `starts_with()`, `ends_with()`, `matches()` and `contains()`. These let you quickly match larger blocks of variables that meet some criterion. See `?select` for more details.
You can rename variables with `select()` by using named arguments:
@@ -147,28 +115,18 @@ But because `select()` drops all the variables not explicitly mentioned, it's no
rename(flights, tail_num = tailnum)
```
-## Extract distinct (unique) rows
-
-Use `distinct()`to find unique values in a table:
-
-```{r}
-distinct(flights, tailnum)
-distinct(flights, origin, dest)
-```
-
-(This is very similar to `base::unique()` but should be much faster.)
-
-## Add new columns with `mutate()`
+### Add new columns with `mutate()`
Besides selecting sets of existing columns, it's often useful to add new columns that are functions of existing columns. This is the job of `mutate()`:
```{r}
mutate(flights,
gain = arr_delay - dep_delay,
- speed = distance / air_time * 60)
+ speed = distance / air_time * 60
+)
```
-`dplyr::mutate()` works the same way as `plyr::mutate()` and similarly to `base::transform()`. The key difference between `mutate()` and `transform()` is that mutate allows you to refer to columns that you've just created:
+`dplyr::mutate()` is similar to the base `transform()`, but allows you to refer to columns that you've just created:
```{r}
mutate(flights,
@@ -177,14 +135,6 @@ mutate(flights,
)
```
-```{r, eval = FALSE}
-transform(flights,
- gain = arr_delay - delay,
- gain_per_hour = gain / (air_time / 60)
-)
-#> Error: object 'gain' not found
-```
-
If you only want to keep the new variables, use `transmute()`:
```{r}
@@ -194,18 +144,19 @@ transmute(flights,
)
```
-## Summarise values with `summarise()`
+### Summarise values with `summarise()`
-The last verb is `summarise()`. It collapses a data frame to a single row (this is exactly equivalent to `plyr::summarise()`):
+The last verb is `summarise()`. It collapses a data frame to a single row.
```{r}
summarise(flights,
- delay = mean(dep_delay, na.rm = TRUE))
+ delay = mean(dep_delay, na.rm = TRUE)
+)
```
-Below, we'll see how this verb can be very useful.
+It's not that useful until we learn the `group_by()` verb below.
-## Randomly sample rows with `sample_n()` and `sample_frac()`
+### Randomly sample rows with `sample_n()` and `sample_frac()`
You can use `sample_n()` and `sample_frac()` to take a random sample of rows: use `sample_n()` for a fixed number and `sample_frac()` for a fixed fraction.
@@ -216,14 +167,14 @@ sample_frac(flights, 0.01)
Use `replace = TRUE` to perform a bootstrap sample. If needed, you can weight the sample with the `weight` argument.
-## Commonalities
+### Commonalities
You may have noticed that the syntax and function of all these verbs are very similar:
* The first argument is a data frame.
-* The subsequent arguments describe what to do with the data frame. Notice that you can refer
- to columns in the data frame directly without using `$`.
+* The subsequent arguments describe what to do with the data frame. You can
+ refer to columns in the data frame directly without using `$`.
* The result is a new data frame
@@ -231,16 +182,32 @@ Together these properties make it easy to chain together multiple simple steps t
These five functions provide the basis of a language of data manipulation. At the most basic level, you can only alter a tidy data frame in five useful ways: you can reorder the rows (`arrange()`), pick observations and variables of interest (`filter()` and `select()`), add new variables that are functions of existing variables (`mutate()`), or collapse many values to a summary (`summarise()`). The remainder of the language comes from applying the five functions to different types of dat [...]
-# Grouped operations
-These verbs are useful on their own, but they become really powerful when you apply them to groups of observations within a dataset. In dplyr, you do this by with the `group_by()` function. It breaks down a dataset into specified groups of rows. When you then apply the verbs above on the resulting object they'll be automatically applied "by group". Most importantly, all this is achieved by using the same exact syntax you'd use with an ungrouped object.
+## Patterns of operations
+
+The dplyr verbs can be classified by the type of operations they
+accomplish (we sometimes speak of their **semantics**, i.e., their
+meaning). The most important and useful distinction is between grouped
+and ungrouped operations. In addition, it is helpful to have a good
+grasp of the difference between select and mutate operations.
+
+
+### Grouped operations
+
+The dplyr verbs are useful on their own, but they become even more
+powerful when you apply them to groups of observations within a
+dataset. In dplyr, you do this with the `group_by()` function. It
+breaks down a dataset into specified groups of rows. When you then
+apply the verbs above on the resulting object they'll be automatically
+applied "by group".
Grouping affects the verbs as follows:
* grouped `select()` is the same as ungrouped `select()`, except that
grouping variables are always retained.
-* grouped `arrange()` orders first by the grouping variables
+* grouped `arrange()` is the same as ungrouped; unless you set
+ `.by_group = TRUE`, in which case it orders first by the grouping variables
* `mutate()` and `filter()` are most useful in conjunction with window
functions (like `rank()`, or `min(x) == x`). They are described in detail in
@@ -248,13 +215,10 @@ Grouping affects the verbs as follows:
* `sample_n()` and `sample_frac()` sample the specified number/fraction of
rows in each group.
-
-* `slice()` extracts rows within each group.
-
-* `summarise()` is powerful and easy to understand, as described in
- more detail below.
-In the following example, we split the complete dataset into individual planes and then summarise each plane by counting the number of flights (`count = n()`) and computing the average distance (`dist = mean(Distance, na.rm = TRUE)`) and arrival delay (`delay = mean(ArrDelay, na.rm = TRUE)`). We then use ggplot2 to display the output.
+* `summarise()` computes the summary for each group.
+
+In the following example, we split the complete dataset into individual planes and then summarise each plane by counting the number of flights (`count = n()`) and computing the average distance (`dist = mean(distance, na.rm = TRUE)`) and arrival delay (`delay = mean(arr_delay, na.rm = TRUE)`). We then use ggplot2 to display the output.
```{r, warning = FALSE, message = FALSE, fig.width = 6}
by_tailnum <- group_by(flights, tailnum)
@@ -292,8 +256,6 @@ summarise(destinations,
)
```
-You can also use any function that you write yourself. For performance, dplyr provides optimised C++ versions of many of these functions. If you want to provide your own C++ function, see the hybrid-evaluation vignette for more details.
-
When you group by multiple variables, each summary peels off one level of the grouping. That makes it easy to progressively roll-up a dataset:
```{r}
@@ -305,7 +267,166 @@ daily <- group_by(flights, year, month, day)
However you need to be careful when progressively rolling up summaries like this: it's ok for sums and counts, but you need to think about weighting for means and variances (it's not possible to do this exactly for medians).
-## Chaining
+
+### Selecting operations
+
+One of the appealing features of dplyr is that you can refer to
+columns from the tibble as if they were regular variables. However,
+the syntactic uniformity of referring to bare column names hides
+semantic differences across the verbs. A column symbol supplied to
+`select()` does not have the same meaning as the same symbol supplied
+to `mutate()`.
+
+Selecting operations expect column names and positions. Hence, when
+you call `select()` with bare variable names, they actually represent
+their own positions in the tibble. The following calls are completely
+equivalent from dplyr's point of view:
+
+```{r}
+# `year` represents the integer 1
+select(flights, year)
+select(flights, 1)
+```
+
+By the same token, this means that you cannot refer to variables from
+the surrounding context if they have the same name as one of the
+columns. In the following example, `year` still represents 1, not 5:
+
+```r
+year <- 5
+select(flights, year)
+```
+
+One useful subtlety is that this only applies to bare names and to
+selecting calls like `c(year, month, day)` or `year:day`. In all other
+cases, the columns of the data frame are not put in scope. This allows
+you to refer to contextual variables in selection helpers:
+
+```{r}
+year <- "dep"
+select(flights, starts_with(year))
+```
+
+These semantics are usually intuitive. But note the subtle difference:
+
+```{r}
+year <- 5
+select(flights, year, identity(year))
+```
+
+In the first argument, `year` represents its own position `1`. In the
+second argument, `year` is evaluated in the surrounding context and
+represents the fifth column.
+
+For a long time, `select()` used to only understand column positions.
+Counting from dplyr 0.6, it now understands column names as well. This
+makes it a bit easier to program with `select()`:
+
+```{r}
+vars <- c("year", "month")
+select(flights, vars, "day")
+```
+
+Note that the code above is somewhat unsafe because you might have
+added a column named `vars` to the tibble, or you might apply the code
+to another data frame containing such a column. To avoid this issue,
+you can wrap the variable in an `identity()` call as we mentioned
+above, as this will bypass column names. However, a more explicit and
+general method that works in all dplyr verbs is to unquote the
+variable with the `!!` operator. This tells dplyr to bypass the data
+frame and to directly look in the context:
+
+```{r}
+# Let's create a new `vars` column:
+flights$vars <- flights$year
+
+# The new column won't be an issue if you evaluate `vars` in the
+# context with the `!!` operator:
+vars <- c("year", "month", "day")
+select(flights, !! vars)
+```
+
+This operator is very useful when you need to use dplyr within custom
+functions. You can learn more about it in `vignette("programming")`.
+However it is important to understand the semantics of the verbs you
+are unquoting into, that is, the values they understand. As we have
+just seen, `select()` supports names and positions of columns. But
+that won't be the case in other verbs like `mutate()` because they
+have different semantics.
+
+
+### Mutating operations
+
+Mutate semantics are quite different from selection semantics. Whereas
+`select()` expects column names or positions, `mutate()` expects
+*column vectors*. Let's create a smaller tibble for clarity:
+
+```{r}
+df <- select(flights, year:dep_time)
+```
+
+When we use `select()`, the bare column names stand for their own
+positions in the tibble. For `mutate()` on the other hand, column
+symbols represent the actual column vectors stored in the tibble.
+Consider what happens if we give a string or a number to `mutate()`:
+
+```{r}
+mutate(df, "year", 2)
+```
+
+`mutate()` gets length-1 vectors that it interprets as new columns in
+the data frame. These vectors are recycled so they match the number of
+rows. That's why it doesn't make sense to supply expressions like
+`"year" + 10` to `mutate()`. This amounts to adding 10 to a string!
+The correct expression is:
+
+```{r}
+mutate(df, year + 10)
+```
+
+In the same way, you can unquote values from the context if these
+values represent a valid column. They must be either length 1 (they
+then get recycled) or have the same length as the number of rows. In
+the following example we create a new vector that we add to the data
+frame:
+
+```{r}
+var <- seq(1, nrow(df))
+mutate(df, new = var)
+```
+
+A case in point is `group_by()`. While you might think it has select
+semantics, it actually has mutate semantics. This is quite handy as it
+allows you to group by a modified column:
+
+```{r}
+group_by(df, month)
+group_by(df, month = as.factor(month))
+group_by(df, day_binned = cut(day, 3))
+```
+
+This is why you can't supply a column name to `group_by()`. This
+amounts to creating a new column containing the string recycled to the
+number of rows:
+
+```{r}
+group_by(df, "month")
+```
+
+Since grouping with select semantics can sometimes be useful as well,
+we have added the `group_by_at()` variant. In dplyr, variants suffixed
+with `_at()` support selection semantics in their second argument. You
+just need to wrap the selection with `vars()`:
+
+```{r}
+group_by_at(df, vars(year:day))
+```
+
+You can read more about the `_at()` and `_if()` variants in the
+`?scoped` help page.
+
+
+## Piping
The dplyr API is functional in the sense that function calls don't have side-effects. You must always save their results. This doesn't lead to particularly elegant code, especially if you want to do many operations at once. You either have to do it step-by-step:
@@ -318,7 +439,7 @@ a3 <- summarise(a2,
a4 <- filter(a3, arr > 30 | dep > 30)
```
-Or if you don't want to save the intermediate results, you need to wrap the function calls inside each other:
+Or if you don't want to name the intermediate results, you need to wrap the function calls inside each other:
```{r}
filter(
@@ -334,7 +455,7 @@ filter(
)
```
-This is difficult to read because the order of the operations is from inside to out. Thus, the arguments are a long way away from the function. To get around this problem, dplyr provides the `%>%` operator. `x %>% f(y)` turns into `f(x, y)` so you can use it to rewrite multiple operations that you can read left-to-right, top-to-bottom:
+This is difficult to read because the order of the operations is from inside to out. Thus, the arguments are a long way away from the function. To get around this problem, dplyr provides the `%>%` operator from magrittr. `x %>% f(y)` turns into `f(x, y)` so you can use it to rewrite multiple operations that you can read left-to-right, top-to-bottom:
```{r, eval = FALSE}
flights %>%
@@ -347,11 +468,11 @@ flights %>%
filter(arr > 30 | dep > 30)
```
-# Other data sources
+## Other data sources
As well as data frames, dplyr works with data that is stored in other ways, like data tables, databases and multidimensional arrays.
-## Data table
+### Data table
dplyr also provides [data table](http://datatable.r-forge.r-project.org/) methods for all verbs through [dtplyr](http://github.com/hadley/dtplyr). If you're using data.tables already this lets you use dplyr syntax for data manipulation, and data.table for everything else.
@@ -366,21 +487,15 @@ The advantages of using dplyr with data tables are:
* Instead of one complex method built on the subscripting operator (`[`),
it provides many simple methods.
-## Databases
-
-dplyr also allows you to use the same verbs with a remote database. It takes care of generating the SQL for you so that you can avoid the cognitive challenge of constantly switching between languages. See the databases vignette for more details.
-
-Compared to DBI and the database connection algorithms:
+### Databases
-* it hides, as much as possible, the fact that you're working with a remote database
-* you don't need to know any SQL (although it helps!)
-* it abstracts over the many differences between the different DBI implementations
+dplyr also allows you to use the same verbs with a remote database. It takes care of generating the SQL for you so that you can avoid the cognitive challenge of constantly switching between languages. To use these capabilities, you'll need to install the dbplyr package and then read `vignette("dbplyr")` for the details.
-## Multidimensional arrays / cubes
+### Multidimensional arrays / cubes
`tbl_cube()` provides an experimental interface to multidimensional arrays or data cubes. If you're using this form of data in R, please get in touch so I can better understand your needs.
-# Comparisons
+## Comparisons
Compared to all existing options, dplyr:
diff --git a/inst/doc/dplyr.html b/inst/doc/dplyr.html
new file mode 100644
index 0000000..bd279a0
--- /dev/null
+++ b/inst/doc/dplyr.html
@@ -0,0 +1,664 @@
+<!DOCTYPE html>
+
+<html xmlns="http://www.w3.org/1999/xhtml">
+
+<head>
+
+<meta charset="utf-8" />
+<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
+<meta name="generator" content="pandoc" />
+
+<meta name="viewport" content="width=device-width, initial-scale=1">
+
+
+
+<title>Introduction to dplyr</title>
+
+
+
+<style type="text/css">code{white-space: pre;}</style>
+<style type="text/css">
+div.sourceCode { overflow-x: auto; }
+table.sourceCode, tr.sourceCode, td.lineNumbers, td.sourceCode {
+ margin: 0; padding: 0; vertical-align: baseline; border: none; }
+table.sourceCode { width: 100%; line-height: 100%; }
+td.lineNumbers { text-align: right; padding-right: 4px; padding-left: 4px; color: #aaaaaa; border-right: 1px solid #aaaaaa; }
+td.sourceCode { padding-left: 5px; }
+code > span.kw { color: #007020; font-weight: bold; } /* Keyword */
+code > span.dt { color: #902000; } /* DataType */
+code > span.dv { color: #40a070; } /* DecVal */
+code > span.bn { color: #40a070; } /* BaseN */
+code > span.fl { color: #40a070; } /* Float */
+code > span.ch { color: #4070a0; } /* Char */
+code > span.st { color: #4070a0; } /* String */
+code > span.co { color: #60a0b0; font-style: italic; } /* Comment */
+code > span.ot { color: #007020; } /* Other */
+code > span.al { color: #ff0000; font-weight: bold; } /* Alert */
+code > span.fu { color: #06287e; } /* Function */
+code > span.er { color: #ff0000; font-weight: bold; } /* Error */
+code > span.wa { color: #60a0b0; font-weight: bold; font-style: italic; } /* Warning */
+code > span.cn { color: #880000; } /* Constant */
+code > span.sc { color: #4070a0; } /* SpecialChar */
+code > span.vs { color: #4070a0; } /* VerbatimString */
+code > span.ss { color: #bb6688; } /* SpecialString */
+code > span.im { } /* Import */
+code > span.va { color: #19177c; } /* Variable */
+code > span.cf { color: #007020; font-weight: bold; } /* ControlFlow */
+code > span.op { color: #666666; } /* Operator */
+code > span.bu { } /* BuiltIn */
+code > span.ex { } /* Extension */
+code > span.pp { color: #bc7a00; } /* Preprocessor */
+code > span.at { color: #7d9029; } /* Attribute */
+code > span.do { color: #ba2121; font-style: italic; } /* Documentation */
+code > span.an { color: #60a0b0; font-weight: bold; font-style: italic; } /* Annotation */
+code > span.cv { color: #60a0b0; font-weight: bold; font-style: italic; } /* CommentVar */
+code > span.in { color: #60a0b0; font-weight: bold; font-style: italic; } /* Information */
+</style>
+
+
+
+<link href="data:text/css;charset=utf-8,body%20%7B%0Abackground%2Dcolor%3A%20%23fff%3B%0Amargin%3A%201em%20auto%3B%0Amax%2Dwidth%3A%20700px%3B%0Aoverflow%3A%20visible%3B%0Apadding%2Dleft%3A%202em%3B%0Apadding%2Dright%3A%202em%3B%0Afont%2Dfamily%3A%20%22Open%20Sans%22%2C%20%22Helvetica%20Neue%22%2C%20Helvetica%2C%20Arial%2C%20sans%2Dserif%3B%0Afont%2Dsize%3A%2014px%3B%0Aline%2Dheight%3A%201%2E35%3B%0A%7D%0A%23header%20%7B%0Atext%2Dalign%3A%20center%3B%0A%7D%0A%23TOC%20%7B%0Aclear%3A%20bot [...]
+
+</head>
+
+<body>
+
+
+
+
+<h1 class="title toc-ignore">Introduction to dplyr</h1>
+
+
+
+<p>When working with data you must:</p>
+<ul>
+<li><p>Figure out what you want to do.</p></li>
+<li><p>Describe those tasks in the form of a computer program.</p></li>
+<li><p>Execute the program.</p></li>
+</ul>
+<p>The dplyr package makes these steps fast and easy:</p>
+<ul>
+<li><p>By constraining your options, it helps you think about your data manipulation challenges.</p></li>
+<li><p>It provides simple “verbs”, functions that correspond to the most common data manipulation tasks, to help you translate your thoughts into code.</p></li>
+<li><p>It uses efficient backends, so you spend less time waiting for the computer.</p></li>
+</ul>
+<p>This document introduces you to dplyr’s basic set of tools, and shows you how to apply them to data frames. dplyr also supports databases via the dbplyr package; once you’ve installed it, read <code>vignette(&quot;dbplyr&quot;)</code> to learn more.</p>
+<div id="data-nycflights13" class="section level2">
+<h2>Data: nycflights13</h2>
+<p>To explore the basic data manipulation verbs of dplyr, we’ll use <code>nycflights13::flights</code>. This dataset contains all 336776 flights that departed from New York City in 2013. The data comes from the US <a href="http://www.transtats.bts.gov/DatabaseInfo.asp?DB_ID=120&Link=0">Bureau of Transportation Statistics</a>, and is documented in <code>?nycflights13</code></p>
+<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"><span class="kw">library</span>(nycflights13)
+<span class="kw">dim</span>(flights)
+<span class="co">#> [1] 336776 19</span>
+flights
+<span class="co">#> # A tibble: 336,776 x 19</span>
+<span class="co">#> year month day dep_t… sche… dep_… arr_… sche… arr_… carr… flig… tail…</span>
+<span class="co">#> <int> <int> <int> <int> <int> <dbl> <int> <int> <dbl> <chr> <int> <chr></span>
+<span class="co">#> 1 2013 1 1 517 515 2.00 830 819 11.0 UA 1545 N142…</span>
+<span class="co">#> 2 2013 1 1 533 529 4.00 850 830 20.0 UA 1714 N242…</span>
+<span class="co">#> 3 2013 1 1 542 540 2.00 923 850 33.0 AA 1141 N619…</span>
+<span class="co">#> 4 2013 1 1 544 545 -1.00 1004 1022 -18.0 B6 725 N804…</span>
+<span class="co">#> # ... with 336,772 more rows, and 7 more variables: origin <chr>,</span>
+<span class="co">#> # dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>,</span>
+<span class="co">#> # time_hour <dttm></span></code></pre></div>
+<p>Note that <code>nycflights13::flights</code> is a tibble, a modern reimagining of the data frame. It’s particularly useful for large datasets because it only prints the first few rows. You can learn more about tibbles at <a href="http://tibble.tidyverse.org" class="uri">http://tibble.tidyverse.org</a>; in particular you can convert data frames to tibbles with <code>as_tibble()</code>.</p>
+</div>
+<div id="single-table-verbs" class="section level2">
+<h2>Single table verbs</h2>
+<p>Dplyr aims to provide a function for each basic verb of data manipulation:</p>
+<ul>
+<li><code>filter()</code> to select cases based on their values.</li>
+<li><code>arrange()</code> to reorder the cases.</li>
+<li><code>select()</code> and <code>rename()</code> to select variables based on their names.</li>
+<li><code>mutate()</code> and <code>transmute()</code> to add new variables that are functions of existing variables.</li>
+<li><code>summarise()</code> to condense multiple values to a single value.</li>
+<li><code>sample_n()</code> and <code>sample_frac()</code> to take random samples.</li>
+</ul>
+<div id="filter-rows-with-filter" class="section level3">
+<h3>Filter rows with <code>filter()</code></h3>
+<p><code>filter()</code> allows you to select a subset of rows in a data frame. Like all single verbs, the first argument is the tibble (or data frame). The second and subsequent arguments refer to variables within that data frame, selecting rows where the expression is <code>TRUE</code>.</p>
+<p>For example, we can select all flights on January 1st with:</p>
+<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"><span class="kw">filter</span>(flights, month <span class="op">==</span><span class="st"> </span><span class="dv">1</span>, day <span class="op">==</span><span class="st"> </span><span class="dv">1</span>)
+<span class="co">#> # A tibble: 842 x 19</span>
+<span class="co">#> year month day dep_t… sche… dep_… arr_… sche… arr_… carr… flig… tail…</span>
+<span class="co">#> <int> <int> <int> <int> <int> <dbl> <int> <int> <dbl> <chr> <int> <chr></span>
+<span class="co">#> 1 2013 1 1 517 515 2.00 830 819 11.0 UA 1545 N142…</span>
+<span class="co">#> 2 2013 1 1 533 529 4.00 850 830 20.0 UA 1714 N242…</span>
+<span class="co">#> 3 2013 1 1 542 540 2.00 923 850 33.0 AA 1141 N619…</span>
+<span class="co">#> 4 2013 1 1 544 545 -1.00 1004 1022 -18.0 B6 725 N804…</span>
+<span class="co">#> # ... with 838 more rows, and 7 more variables: origin <chr>, dest <chr>,</span>
+<span class="co">#> # air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>,</span>
+<span class="co">#> # time_hour <dttm></span></code></pre></div>
+<p>This is roughly equivalent to this base R code:</p>
+<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r">flights[flights<span class="op">$</span>month <span class="op">==</span><span class="st"> </span><span class="dv">1</span> <span class="op">&</span><span class="st"> </span>flights<span class="op">$</span>day <span class="op">==</span><span class="st"> </span><span class="dv">1</span>, ]</code></pre></div>
+</div>
+<div id="arrange-rows-with-arrange" class="section level3">
+<h3>Arrange rows with <code>arrange()</code></h3>
+<p><code>arrange()</code> works similarly to <code>filter()</code> except that instead of filtering or selecting rows, it reorders them. It takes a data frame, and a set of column names (or more complicated expressions) to order by. If you provide more than one column name, each additional column will be used to break ties in the values of preceding columns:</p>
+<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"><span class="kw">arrange</span>(flights, year, month, day)
+<span class="co">#> # A tibble: 336,776 x 19</span>
+<span class="co">#> year month day dep_t… sche… dep_… arr_… sche… arr_… carr… flig… tail…</span>
+<span class="co">#> <int> <int> <int> <int> <int> <dbl> <int> <int> <dbl> <chr> <int> <chr></span>
+<span class="co">#> 1 2013 1 1 517 515 2.00 830 819 11.0 UA 1545 N142…</span>
+<span class="co">#> 2 2013 1 1 533 529 4.00 850 830 20.0 UA 1714 N242…</span>
+<span class="co">#> 3 2013 1 1 542 540 2.00 923 850 33.0 AA 1141 N619…</span>
+<span class="co">#> 4 2013 1 1 544 545 -1.00 1004 1022 -18.0 B6 725 N804…</span>
+<span class="co">#> # ... with 336,772 more rows, and 7 more variables: origin <chr>,</span>
+<span class="co">#> # dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>,</span>
+<span class="co">#> # time_hour <dttm></span></code></pre></div>
+<p>Use <code>desc()</code> to order a column in descending order:</p>
+<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"><span class="kw">arrange</span>(flights, <span class="kw">desc</span>(arr_delay))
+<span class="co">#> # A tibble: 336,776 x 19</span>
+<span class="co">#> year month day dep_t… sche… dep_… arr_… sche… arr_… carr… flig… tail…</span>
+<span class="co">#> <int> <int> <int> <int> <int> <dbl> <int> <int> <dbl> <chr> <int> <chr></span>
+<span class="co">#> 1 2013 1 9 641 900 1301 1242 1530 1272 HA 51 N384…</span>
+<span class="co">#> 2 2013 6 15 1432 1935 1137 1607 2120 1127 MQ 3535 N504…</span>
+<span class="co">#> 3 2013 1 10 1121 1635 1126 1239 1810 1109 MQ 3695 N517…</span>
+<span class="co">#> 4 2013 9 20 1139 1845 1014 1457 2210 1007 AA 177 N338…</span>
+<span class="co">#> # ... with 336,772 more rows, and 7 more variables: origin <chr>,</span>
+<span class="co">#> # dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>,</span>
+<span class="co">#> # time_hour <dttm></span></code></pre></div>
+</div>
+<div id="select-columns-with-select" class="section level3">
+<h3>Select columns with <code>select()</code></h3>
+<p>Often you work with large datasets with many columns but only a few are actually of interest to you. <code>select()</code> allows you to rapidly zoom in on a useful subset using operations that usually only work on numeric variable positions:</p>
+<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"><span class="co"># Select columns by name</span>
+<span class="kw">select</span>(flights, year, month, day)
+<span class="co">#> # A tibble: 336,776 x 3</span>
+<span class="co">#> year month day</span>
+<span class="co">#> <int> <int> <int></span>
+<span class="co">#> 1 2013 1 1</span>
+<span class="co">#> 2 2013 1 1</span>
+<span class="co">#> 3 2013 1 1</span>
+<span class="co">#> 4 2013 1 1</span>
+<span class="co">#> # ... with 336,772 more rows</span>
+<span class="co"># Select all columns between year and day (inclusive)</span>
+<span class="kw">select</span>(flights, year<span class="op">:</span>day)
+<span class="co">#> # A tibble: 336,776 x 3</span>
+<span class="co">#> year month day</span>
+<span class="co">#> <int> <int> <int></span>
+<span class="co">#> 1 2013 1 1</span>
+<span class="co">#> 2 2013 1 1</span>
+<span class="co">#> 3 2013 1 1</span>
+<span class="co">#> 4 2013 1 1</span>
+<span class="co">#> # ... with 336,772 more rows</span>
+<span class="co"># Select all columns except those from year to day (inclusive)</span>
+<span class="kw">select</span>(flights, <span class="op">-</span>(year<span class="op">:</span>day))
+<span class="co">#> # A tibble: 336,776 x 16</span>
+<span class="co">#> dep_t… sche… dep_… arr_… sche… arr_… carr… flig… tail… orig… dest air_…</span>
+<span class="co">#> <int> <int> <dbl> <int> <int> <dbl> <chr> <int> <chr> <chr> <chr> <dbl></span>
+<span class="co">#> 1 517 515 2.00 830 819 11.0 UA 1545 N142… EWR IAH 227</span>
+<span class="co">#> 2 533 529 4.00 850 830 20.0 UA 1714 N242… LGA IAH 227</span>
+<span class="co">#> 3 542 540 2.00 923 850 33.0 AA 1141 N619… JFK MIA 160</span>
+<span class="co">#> 4 544 545 -1.00 1004 1022 -18.0 B6 725 N804… JFK BQN 183</span>
+<span class="co">#> # ... with 336,772 more rows, and 4 more variables: distance <dbl>,</span>
+<span class="co">#> # hour <dbl>, minute <dbl>, time_hour <dttm></span></code></pre></div>
+<p>There are a number of helper functions you can use within <code>select()</code>, like <code>starts_with()</code>, <code>ends_with()</code>, <code>matches()</code> and <code>contains()</code>. These let you quickly match larger blocks of variables that meet some criterion. See <code>?select</code> for more details.</p>
+<p>You can rename variables with <code>select()</code> by using named arguments:</p>
+<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"><span class="kw">select</span>(flights, <span class="dt">tail_num =</span> tailnum)
+<span class="co">#> # A tibble: 336,776 x 1</span>
+<span class="co">#> tail_num</span>
+<span class="co">#> <chr> </span>
+<span class="co">#> 1 N14228 </span>
+<span class="co">#> 2 N24211 </span>
+<span class="co">#> 3 N619AA </span>
+<span class="co">#> 4 N804JB </span>
+<span class="co">#> # ... with 336,772 more rows</span></code></pre></div>
+<p>But because <code>select()</code> drops all the variables not explicitly mentioned, it’s not that useful. Instead, use <code>rename()</code>:</p>
+<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"><span class="kw">rename</span>(flights, <span class="dt">tail_num =</span> tailnum)
+<span class="co">#> # A tibble: 336,776 x 19</span>
+<span class="co">#> year month day dep_t… sche… dep_… arr_… sche… arr_… carr… flig… tail…</span>
+<span class="co">#> <int> <int> <int> <int> <int> <dbl> <int> <int> <dbl> <chr> <int> <chr></span>
+<span class="co">#> 1 2013 1 1 517 515 2.00 830 819 11.0 UA 1545 N142…</span>
+<span class="co">#> 2 2013 1 1 533 529 4.00 850 830 20.0 UA 1714 N242…</span>
+<span class="co">#> 3 2013 1 1 542 540 2.00 923 850 33.0 AA 1141 N619…</span>
+<span class="co">#> 4 2013 1 1 544 545 -1.00 1004 1022 -18.0 B6 725 N804…</span>
+<span class="co">#> # ... with 336,772 more rows, and 7 more variables: origin <chr>,</span>
+<span class="co">#> # dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>,</span>
+<span class="co">#> # time_hour <dttm></span></code></pre></div>
+</div>
+<div id="add-new-columns-with-mutate" class="section level3">
+<h3>Add new columns with <code>mutate()</code></h3>
+<p>Besides selecting sets of existing columns, it’s often useful to add new columns that are functions of existing columns. This is the job of <code>mutate()</code>:</p>
+<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"><span class="kw">mutate</span>(flights,
+ <span class="dt">gain =</span> arr_delay <span class="op">-</span><span class="st"> </span>dep_delay,
+ <span class="dt">speed =</span> distance <span class="op">/</span><span class="st"> </span>air_time <span class="op">*</span><span class="st"> </span><span class="dv">60</span>
+)
+<span class="co">#> # A tibble: 336,776 x 21</span>
+<span class="co">#> year month day dep_t… sche… dep_… arr_… sche… arr_… carr… flig… tail…</span>
+<span class="co">#> <int> <int> <int> <int> <int> <dbl> <int> <int> <dbl> <chr> <int> <chr></span>
+<span class="co">#> 1 2013 1 1 517 515 2.00 830 819 11.0 UA 1545 N142…</span>
+<span class="co">#> 2 2013 1 1 533 529 4.00 850 830 20.0 UA 1714 N242…</span>
+<span class="co">#> 3 2013 1 1 542 540 2.00 923 850 33.0 AA 1141 N619…</span>
+<span class="co">#> 4 2013 1 1 544 545 -1.00 1004 1022 -18.0 B6 725 N804…</span>
+<span class="co">#> # ... with 336,772 more rows, and 9 more variables: origin <chr>,</span>
+<span class="co">#> # dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>,</span>
+<span class="co">#> # time_hour <dttm>, gain <dbl>, speed <dbl></span></code></pre></div>
+<p><code>dplyr::mutate()</code> is similar to the base <code>transform()</code>, but allows you to refer to columns that you’ve just created:</p>
+<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"><span class="kw">mutate</span>(flights,
+ <span class="dt">gain =</span> arr_delay <span class="op">-</span><span class="st"> </span>dep_delay,
+ <span class="dt">gain_per_hour =</span> gain <span class="op">/</span><span class="st"> </span>(air_time <span class="op">/</span><span class="st"> </span><span class="dv">60</span>)
+)
+<span class="co">#> # A tibble: 336,776 x 21</span>
+<span class="co">#> year month day dep_t… sche… dep_… arr_… sche… arr_… carr… flig… tail…</span>
+<span class="co">#> <int> <int> <int> <int> <int> <dbl> <int> <int> <dbl> <chr> <int> <chr></span>
+<span class="co">#> 1 2013 1 1 517 515 2.00 830 819 11.0 UA 1545 N142…</span>
+<span class="co">#> 2 2013 1 1 533 529 4.00 850 830 20.0 UA 1714 N242…</span>
+<span class="co">#> 3 2013 1 1 542 540 2.00 923 850 33.0 AA 1141 N619…</span>
+<span class="co">#> 4 2013 1 1 544 545 -1.00 1004 1022 -18.0 B6 725 N804…</span>
+<span class="co">#> # ... with 336,772 more rows, and 9 more variables: origin <chr>,</span>
+<span class="co">#> # dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>,</span>
+<span class="co">#> # time_hour <dttm>, gain <dbl>, gain_per_hour <dbl></span></code></pre></div>
+<p>If you only want to keep the new variables, use <code>transmute()</code>:</p>
+<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"><span class="kw">transmute</span>(flights,
+ <span class="dt">gain =</span> arr_delay <span class="op">-</span><span class="st"> </span>dep_delay,
+ <span class="dt">gain_per_hour =</span> gain <span class="op">/</span><span class="st"> </span>(air_time <span class="op">/</span><span class="st"> </span><span class="dv">60</span>)
+)
+<span class="co">#> # A tibble: 336,776 x 2</span>
+<span class="co">#> gain gain_per_hour</span>
+<span class="co">#> <dbl> <dbl></span>
+<span class="co">#> 1 9.00 2.38</span>
+<span class="co">#> 2 16.0 4.23</span>
+<span class="co">#> 3 31.0 11.6 </span>
+<span class="co">#> 4 -17.0 - 5.57</span>
+<span class="co">#> # ... with 336,772 more rows</span></code></pre></div>
+</div>
+<div id="summarise-values-with-summarise" class="section level3">
+<h3>Summarise values with <code>summarise()</code></h3>
+<p>The last verb is <code>summarise()</code>. It collapses a data frame to a single row.</p>
+<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"><span class="kw">summarise</span>(flights,
+ <span class="dt">delay =</span> <span class="kw">mean</span>(dep_delay, <span class="dt">na.rm =</span> <span class="ot">TRUE</span>)
+)
+<span class="co">#> # A tibble: 1 x 1</span>
+<span class="co">#> delay</span>
+<span class="co">#> <dbl></span>
+<span class="co">#> 1 12.6</span></code></pre></div>
+<p>It’s not that useful until we learn the <code>group_by()</code> verb below.</p>
+</div>
+<div id="randomly-sample-rows-with-sample_n-and-sample_frac" class="section level3">
+<h3>Randomly sample rows with <code>sample_n()</code> and <code>sample_frac()</code></h3>
+<p>You can use <code>sample_n()</code> and <code>sample_frac()</code> to take a random sample of rows: use <code>sample_n()</code> for a fixed number and <code>sample_frac()</code> for a fixed fraction.</p>
+<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"><span class="kw">sample_n</span>(flights, <span class="dv">10</span>)
+<span class="co">#> # A tibble: 10 x 19</span>
+<span class="co">#> year month day dep_t… sched_… dep_de… arr_… sched… arr_d… carr… flig…</span>
+<span class="co">#> <int> <int> <int> <int> <int> <dbl> <int> <int> <dbl> <chr> <int></span>
+<span class="co">#> 1 2013 10 1 822 825 - 3.00 932 935 - 3.00 AA 84</span>
+<span class="co">#> 2 2013 8 2 712 715 - 3.00 1015 1010 5.00 VX 399</span>
+<span class="co">#> 3 2013 5 10 1309 1315 - 6.00 1502 1501 1.00 US 1895</span>
+<span class="co">#> 4 2013 10 28 2002 1930 32.0 2318 2250 28.0 DL 795</span>
+<span class="co">#> # ... with 6 more rows, and 8 more variables: tailnum <chr>, origin <chr>,</span>
+<span class="co">#> # dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>,</span>
+<span class="co">#> # time_hour <dttm></span>
+<span class="kw">sample_frac</span>(flights, <span class="fl">0.01</span>)
+<span class="co">#> # A tibble: 3,368 x 19</span>
+<span class="co">#> year month day dep_t… sche… dep_… arr_… sche… arr_… carr… flig… tail…</span>
+<span class="co">#> <int> <int> <int> <int> <int> <dbl> <int> <int> <dbl> <chr> <int> <chr></span>
+<span class="co">#> 1 2013 8 16 827 830 -3.00 928 950 -22.0 AA 1838 N3CA…</span>
+<span class="co">#> 2 2013 11 4 1306 1300 6.00 1639 1610 29.0 VX 411 N641…</span>
+<span class="co">#> 3 2013 1 14 929 935 -6.00 1213 1238 -25.0 B6 361 N639…</span>
+<span class="co">#> 4 2013 12 28 625 630 -5.00 916 1014 -58.0 US 690 N656…</span>
+<span class="co">#> # ... with 3,364 more rows, and 7 more variables: origin <chr>,</span>
+<span class="co">#> # dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>,</span>
+<span class="co">#> # time_hour <dttm></span></code></pre></div>
+<p>Use <code>replace = TRUE</code> to perform a bootstrap sample. If needed, you can weight the sample with the <code>weight</code> argument.</p>
+</div>
+<div id="commonalities" class="section level3">
+<h3>Commonalities</h3>
+<p>You may have noticed that the syntax and function of all these verbs are very similar:</p>
+<ul>
+<li><p>The first argument is a data frame.</p></li>
+<li><p>The subsequent arguments describe what to do with the data frame. You can refer to columns in the data frame directly without using <code>$</code>.</p></li>
+<li><p>The result is a new data frame.</p></li>
+</ul>
+<p>Together these properties make it easy to chain together multiple simple steps to achieve a complex result.</p>
+<p>These five functions provide the basis of a language of data manipulation. At the most basic level, you can only alter a tidy data frame in five useful ways: you can reorder the rows (<code>arrange()</code>), pick observations and variables of interest (<code>filter()</code> and <code>select()</code>), add new variables that are functions of existing variables (<code>mutate()</code>), or collapse many values to a summary (<code>summarise()</code>). The remainder of the language comes [...]
+</div>
+</div>
+<div id="patterns-of-operations" class="section level2">
+<h2>Patterns of operations</h2>
+<p>The dplyr verbs can be classified by the type of operations they accomplish (we sometimes speak of their <strong>semantics</strong>, i.e., their meaning). The most important and useful distinction is between grouped and ungrouped operations. In addition, it is helpful to have a good grasp of the difference between select and mutate operations.</p>
+<div id="grouped-operations" class="section level3">
+<h3>Grouped operations</h3>
+<p>The dplyr verbs are useful on their own, but they become even more powerful when you apply them to groups of observations within a dataset. In dplyr, you do this with the <code>group_by()</code> function. It breaks down a dataset into specified groups of rows. When you then apply the verbs above on the resulting object they’ll be automatically applied “by group”.</p>
+<p>Grouping affects the verbs as follows:</p>
+<ul>
+<li><p>grouped <code>select()</code> is the same as ungrouped <code>select()</code>, except that grouping variables are always retained.</p></li>
+<li><p>grouped <code>arrange()</code> is the same as ungrouped; unless you set <code>.by_group = TRUE</code>, in which case it orders first by the grouping variables</p></li>
+<li><p><code>mutate()</code> and <code>filter()</code> are most useful in conjunction with window functions (like <code>rank()</code>, or <code>min(x) == x</code>). They are described in detail in <code>vignette("window-functions")</code>.</p></li>
+<li><p><code>sample_n()</code> and <code>sample_frac()</code> sample the specified number/fraction of rows in each group.</p></li>
+<li><p><code>summarise()</code> computes the summary for each group.</p></li>
+</ul>
+<p>In the following example, we split the complete dataset into individual planes and then summarise each plane by counting the number of flights (<code>count = n()</code>) and computing the average distance (<code>dist = mean(distance, na.rm = TRUE)</code>) and arrival delay (<code>delay = mean(arr_delay, na.rm = TRUE)</code>). We then use ggplot2 to display the output.</p>
+<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r">by_tailnum <-<span class="st"> </span><span class="kw">group_by</span>(flights, tailnum)
+delay <-<span class="st"> </span><span class="kw">summarise</span>(by_tailnum,
+ <span class="dt">count =</span> <span class="kw">n</span>(),
+ <span class="dt">dist =</span> <span class="kw">mean</span>(distance, <span class="dt">na.rm =</span> <span class="ot">TRUE</span>),
+ <span class="dt">delay =</span> <span class="kw">mean</span>(arr_delay, <span class="dt">na.rm =</span> <span class="ot">TRUE</span>))
+delay <-<span class="st"> </span><span class="kw">filter</span>(delay, count <span class="op">></span><span class="st"> </span><span class="dv">20</span>, dist <span class="op"><</span><span class="st"> </span><span class="dv">2000</span>)
+
+<span class="co"># Interestingly, the average delay is only slightly related to the</span>
+<span class="co"># average distance flown by a plane.</span>
+<span class="kw">ggplot</span>(delay, <span class="kw">aes</span>(dist, delay)) <span class="op">+</span>
+<span class="st"> </span><span class="kw">geom_point</span>(<span class="kw">aes</span>(<span class="dt">size =</span> count), <span class="dt">alpha =</span> <span class="dv">1</span><span class="op">/</span><span class="dv">2</span>) <span class="op">+</span>
+<span class="st"> </span><span class="kw">geom_smooth</span>() <span class="op">+</span>
+<span class="st"> </span><span class="kw">scale_size_area</span>()</code></pre></div>
+<p><img src="data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAkAAAAEgCAIAAADT5FumAAAACXBIWXMAAA7DAAAOwwHHb6hkAAAgAElEQVR4nOydZ1wUV9vwZ7bvsmyj96X3IkWaCjbsXWPst9GYaEzRPClGTeKdHr29jYktliQKakzsBbAriAUpAkrvfRd2l+1tdt8P5/fsy4P0sizx/D8tw8yca2dm5zrnqqher0cgEAgEAhlp4IZbAAgEAoFA+gNUYBAIBAIZkUAFBoFAIJARCVRgEAgEAhmRQAUGgUAgkBEJVGAQCAQCGZEQhluAnpHJZDqdzggDoShKIBA0Go0RxuofFApFq9VqtdrhFqRLSCSSWq0ebim6hEKhYBhmyrfYxC8gmUzW6/WmLKGJX0ASiYQgSL8lNDc3H1RxRjwjQIFpNBrjvLLxeDwej1epVEYYq3/Q6XS1Wm2yEqIoSiKRTFY8BEFoN [...]
+<p>You use <code>summarise()</code> with <strong>aggregate functions</strong>, which take a vector of values and return a single number. There are many useful examples of such functions in base R like <code>min()</code>, <code>max()</code>, <code>mean()</code>, <code>sum()</code>, <code>sd()</code>, <code>median()</code>, and <code>IQR()</code>. dplyr provides a handful of others:</p>
+<ul>
+<li><p><code>n()</code>: the number of observations in the current group</p></li>
+<li><p><code>n_distinct(x)</code>: the number of unique values in <code>x</code>.</p></li>
+<li><p><code>first(x)</code>, <code>last(x)</code> and <code>nth(x, n)</code> - these work similarly to <code>x[1]</code>, <code>x[length(x)]</code>, and <code>x[n]</code> but give you more control over the result if the value is missing.</p></li>
+</ul>
+<p>For example, we could use these to find the number of planes and the number of flights that go to each possible destination:</p>
+<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r">destinations <-<span class="st"> </span><span class="kw">group_by</span>(flights, dest)
+<span class="kw">summarise</span>(destinations,
+ <span class="dt">planes =</span> <span class="kw">n_distinct</span>(tailnum),
+ <span class="dt">flights =</span> <span class="kw">n</span>()
+)
+<span class="co">#> # A tibble: 105 x 3</span>
+<span class="co">#> dest planes flights</span>
+<span class="co">#> <chr> <int> <int></span>
+<span class="co">#> 1 ABQ 108 254</span>
+<span class="co">#> 2 ACK 58 265</span>
+<span class="co">#> 3 ALB 172 439</span>
+<span class="co">#> 4 ANC 6 8</span>
+<span class="co">#> # ... with 101 more rows</span></code></pre></div>
+<p>When you group by multiple variables, each summary peels off one level of the grouping. That makes it easy to progressively roll-up a dataset:</p>
+<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r">daily <-<span class="st"> </span><span class="kw">group_by</span>(flights, year, month, day)
+(per_day <-<span class="st"> </span><span class="kw">summarise</span>(daily, <span class="dt">flights =</span> <span class="kw">n</span>()))
+<span class="co">#> # A tibble: 365 x 4</span>
+<span class="co">#> # Groups: year, month [?]</span>
+<span class="co">#> year month day flights</span>
+<span class="co">#> <int> <int> <int> <int></span>
+<span class="co">#> 1 2013 1 1 842</span>
+<span class="co">#> 2 2013 1 2 943</span>
+<span class="co">#> 3 2013 1 3 914</span>
+<span class="co">#> 4 2013 1 4 915</span>
+<span class="co">#> # ... with 361 more rows</span>
+(per_month <-<span class="st"> </span><span class="kw">summarise</span>(per_day, <span class="dt">flights =</span> <span class="kw">sum</span>(flights)))
+<span class="co">#> # A tibble: 12 x 3</span>
+<span class="co">#> # Groups: year [?]</span>
+<span class="co">#> year month flights</span>
+<span class="co">#> <int> <int> <int></span>
+<span class="co">#> 1 2013 1 27004</span>
+<span class="co">#> 2 2013 2 24951</span>
+<span class="co">#> 3 2013 3 28834</span>
+<span class="co">#> 4 2013 4 28330</span>
+<span class="co">#> # ... with 8 more rows</span>
+(per_year <-<span class="st"> </span><span class="kw">summarise</span>(per_month, <span class="dt">flights =</span> <span class="kw">sum</span>(flights)))
+<span class="co">#> # A tibble: 1 x 2</span>
+<span class="co">#> year flights</span>
+<span class="co">#> <int> <int></span>
+<span class="co">#> 1 2013 336776</span></code></pre></div>
+<p>However you need to be careful when progressively rolling up summaries like this: it’s ok for sums and counts, but you need to think about weighting for means and variances (it’s not possible to do this exactly for medians).</p>
+</div>
+<div id="selecting-operations" class="section level3">
+<h3>Selecting operations</h3>
+<p>One of the appealing features of dplyr is that you can refer to columns from the tibble as if they were regular variables. However, the syntactic uniformity of referring to bare column names hides semantic differences across the verbs. A column symbol supplied to <code>select()</code> does not have the same meaning as the same symbol supplied to <code>mutate()</code>.</p>
+<p>Selecting operations expect column names and positions. Hence, when you call <code>select()</code> with bare variable names, they actually represent their own positions in the tibble. The following calls are completely equivalent from dplyr’s point of view:</p>
+<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"><span class="co"># `year` represents the integer 1</span>
+<span class="kw">select</span>(flights, year)
+<span class="co">#> # A tibble: 336,776 x 1</span>
+<span class="co">#> year</span>
+<span class="co">#> <int></span>
+<span class="co">#> 1 2013</span>
+<span class="co">#> 2 2013</span>
+<span class="co">#> 3 2013</span>
+<span class="co">#> 4 2013</span>
+<span class="co">#> # ... with 336,772 more rows</span>
+<span class="kw">select</span>(flights, <span class="dv">1</span>)
+<span class="co">#> # A tibble: 336,776 x 1</span>
+<span class="co">#> year</span>
+<span class="co">#> <int></span>
+<span class="co">#> 1 2013</span>
+<span class="co">#> 2 2013</span>
+<span class="co">#> 3 2013</span>
+<span class="co">#> 4 2013</span>
+<span class="co">#> # ... with 336,772 more rows</span></code></pre></div>
+<p>By the same token, this means that you cannot refer to variables from the surrounding context if they have the same name as one of the columns. In the following example, <code>year</code> still represents 1, not 5:</p>
+<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r">year <-<span class="st"> </span><span class="dv">5</span>
+<span class="kw">select</span>(flights, year)</code></pre></div>
+<p>One useful subtlety is that this only applies to bare names and to selecting calls like <code>c(year, month, day)</code> or <code>year:day</code>. In all other cases, the columns of the data frame are not put in scope. This allows you to refer to contextual variables in selection helpers:</p>
+<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r">year <-<span class="st"> "dep"</span>
+<span class="kw">select</span>(flights, <span class="kw">starts_with</span>(year))
+<span class="co">#> # A tibble: 336,776 x 2</span>
+<span class="co">#> dep_time dep_delay</span>
+<span class="co">#> <int> <dbl></span>
+<span class="co">#> 1 517 2.00</span>
+<span class="co">#> 2 533 4.00</span>
+<span class="co">#> 3 542 2.00</span>
+<span class="co">#> 4 544 -1.00</span>
+<span class="co">#> # ... with 336,772 more rows</span></code></pre></div>
+<p>These semantics are usually intuitive. But note the subtle difference:</p>
+<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r">year <-<span class="st"> </span><span class="dv">5</span>
+<span class="kw">select</span>(flights, year, <span class="kw">identity</span>(year))
+<span class="co">#> # A tibble: 336,776 x 2</span>
+<span class="co">#> year sched_dep_time</span>
+<span class="co">#> <int> <int></span>
+<span class="co">#> 1 2013 515</span>
+<span class="co">#> 2 2013 529</span>
+<span class="co">#> 3 2013 540</span>
+<span class="co">#> 4 2013 545</span>
+<span class="co">#> # ... with 336,772 more rows</span></code></pre></div>
+<p>In the first argument, <code>year</code> represents its own position <code>1</code>. In the second argument, <code>year</code> is evaluated in the surrounding context and represents the fifth column.</p>
+<p>For a long time, <code>select()</code> used to only understand column positions. Counting from dplyr 0.6, it now understands column names as well. This makes it a bit easier to program with <code>select()</code>:</p>
+<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r">vars <-<span class="st"> </span><span class="kw">c</span>(<span class="st">"year"</span>, <span class="st">"month"</span>)
+<span class="kw">select</span>(flights, vars, <span class="st">"day"</span>)
+<span class="co">#> # A tibble: 336,776 x 3</span>
+<span class="co">#> year month day</span>
+<span class="co">#> <int> <int> <int></span>
+<span class="co">#> 1 2013 1 1</span>
+<span class="co">#> 2 2013 1 1</span>
+<span class="co">#> 3 2013 1 1</span>
+<span class="co">#> 4 2013 1 1</span>
+<span class="co">#> # ... with 336,772 more rows</span></code></pre></div>
+<p>Note that the code above is somewhat unsafe because you might have added a column named <code>vars</code> to the tibble, or you might apply the code to another data frame containing such a column. To avoid this issue, you can wrap the variable in an <code>identity()</code> call as we mentioned above, as this will bypass column names. However, a more explicit and general method that works in all dplyr verbs is to unquote the variable with the <code>!!</code> operator. This tells dplyr [...]
+<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"><span class="co"># Let's create a new `vars` column:</span>
+flights<span class="op">$</span>vars <-<span class="st"> </span>flights<span class="op">$</span>year
+
+<span class="co"># The new column won't be an issue if you evaluate `vars` in the</span>
+<span class="co"># context with the `!!` operator:</span>
+vars <-<span class="st"> </span><span class="kw">c</span>(<span class="st">"year"</span>, <span class="st">"month"</span>, <span class="st">"day"</span>)
+<span class="kw">select</span>(flights, <span class="op">!!</span><span class="st"> </span>vars)
+<span class="co">#> # A tibble: 336,776 x 3</span>
+<span class="co">#> year month day</span>
+<span class="co">#> <int> <int> <int></span>
+<span class="co">#> 1 2013 1 1</span>
+<span class="co">#> 2 2013 1 1</span>
+<span class="co">#> 3 2013 1 1</span>
+<span class="co">#> 4 2013 1 1</span>
+<span class="co">#> # ... with 336,772 more rows</span></code></pre></div>
+<p>This operator is very useful when you need to use dplyr within custom functions. You can learn more about it in <code>vignette("programming")</code>. However it is important to understand the semantics of the verbs you are unquoting into, that is, the values they understand. As we have just seen, <code>select()</code> supports names and positions of columns. But that won’t be the case in other verbs like <code>mutate()</code> because they have different semantics.</p>
+</div>
+<div id="mutating-operations" class="section level3">
+<h3>Mutating operations</h3>
+<p>Mutate semantics are quite different from selection semantics. Whereas <code>select()</code> expects column names or positions, <code>mutate()</code> expects <em>column vectors</em>. Let’s create a smaller tibble for clarity:</p>
+<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r">df <-<span class="st"> </span><span class="kw">select</span>(flights, year<span class="op">:</span>dep_time)</code></pre></div>
+<p>When we use <code>select()</code>, the bare column names stand for their own positions in the tibble. For <code>mutate()</code> on the other hand, column symbols represent the actual column vectors stored in the tibble. Consider what happens if we give a string or a number to <code>mutate()</code>:</p>
+<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"><span class="kw">mutate</span>(df, <span class="st">"year"</span>, <span class="dv">2</span>)
+<span class="co">#> # A tibble: 336,776 x 6</span>
+<span class="co">#> year month day dep_time `"year"` `2`</span>
+<span class="co">#> <int> <int> <int> <int> <chr> <dbl></span>
+<span class="co">#> 1 2013 1 1 517 year 2.00</span>
+<span class="co">#> 2 2013 1 1 533 year 2.00</span>
+<span class="co">#> 3 2013 1 1 542 year 2.00</span>
+<span class="co">#> 4 2013 1 1 544 year 2.00</span>
+<span class="co">#> # ... with 336,772 more rows</span></code></pre></div>
+<p><code>mutate()</code> gets length-1 vectors that it interprets as new columns in the data frame. These vectors are recycled so they match the number of rows. That’s why it doesn’t make sense to supply expressions like <code>"year" + 10</code> to <code>mutate()</code>. This amounts to adding 10 to a string! The correct expression is:</p>
+<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"><span class="kw">mutate</span>(df, year <span class="op">+</span><span class="st"> </span><span class="dv">10</span>)
+<span class="co">#> # A tibble: 336,776 x 5</span>
+<span class="co">#> year month day dep_time `year + 10`</span>
+<span class="co">#> <int> <int> <int> <int> <dbl></span>
+<span class="co">#> 1 2013 1 1 517 2023</span>
+<span class="co">#> 2 2013 1 1 533 2023</span>
+<span class="co">#> 3 2013 1 1 542 2023</span>
+<span class="co">#> 4 2013 1 1 544 2023</span>
+<span class="co">#> # ... with 336,772 more rows</span></code></pre></div>
+<p>In the same way, you can unquote values from the context if these values represent a valid column. They must be either length 1 (they then get recycled) or have the same length as the number of rows. In the following example we create a new vector that we add to the data frame:</p>
+<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r">var <-<span class="st"> </span><span class="kw">seq</span>(<span class="dv">1</span>, <span class="kw">nrow</span>(df))
+<span class="kw">mutate</span>(df, <span class="dt">new =</span> var)
+<span class="co">#> # A tibble: 336,776 x 5</span>
+<span class="co">#> year month day dep_time new</span>
+<span class="co">#> <int> <int> <int> <int> <int></span>
+<span class="co">#> 1 2013 1 1 517 1</span>
+<span class="co">#> 2 2013 1 1 533 2</span>
+<span class="co">#> 3 2013 1 1 542 3</span>
+<span class="co">#> 4 2013 1 1 544 4</span>
+<span class="co">#> # ... with 336,772 more rows</span></code></pre></div>
+<p>A case in point is <code>group_by()</code>. While you might think it has select semantics, it actually has mutate semantics. This is quite handy as it allows you to group by a modified column:</p>
+<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"><span class="kw">group_by</span>(df, month)
+<span class="co">#> # A tibble: 336,776 x 4</span>
+<span class="co">#> # Groups: month [12]</span>
+<span class="co">#> year month day dep_time</span>
+<span class="co">#> <int> <int> <int> <int></span>
+<span class="co">#> 1 2013 1 1 517</span>
+<span class="co">#> 2 2013 1 1 533</span>
+<span class="co">#> 3 2013 1 1 542</span>
+<span class="co">#> 4 2013 1 1 544</span>
+<span class="co">#> # ... with 336,772 more rows</span>
+<span class="kw">group_by</span>(df, <span class="dt">month =</span> <span class="kw">as.factor</span>(month))
+<span class="co">#> # A tibble: 336,776 x 4</span>
+<span class="co">#> # Groups: month [12]</span>
+<span class="co">#> year month day dep_time</span>
+<span class="co">#> <int> <fctr> <int> <int></span>
+<span class="co">#> 1 2013 1 1 517</span>
+<span class="co">#> 2 2013 1 1 533</span>
+<span class="co">#> 3 2013 1 1 542</span>
+<span class="co">#> 4 2013 1 1 544</span>
+<span class="co">#> # ... with 336,772 more rows</span>
+<span class="kw">group_by</span>(df, <span class="dt">day_binned =</span> <span class="kw">cut</span>(day, <span class="dv">3</span>))
+<span class="co">#> # A tibble: 336,776 x 5</span>
+<span class="co">#> # Groups: day_binned [3]</span>
+<span class="co">#> year month day dep_time day_binned</span>
+<span class="co">#> <int> <int> <int> <int> <fctr> </span>
+<span class="co">#> 1 2013 1 1 517 (0.97,11] </span>
+<span class="co">#> 2 2013 1 1 533 (0.97,11] </span>
+<span class="co">#> 3 2013 1 1 542 (0.97,11] </span>
+<span class="co">#> 4 2013 1 1 544 (0.97,11] </span>
+<span class="co">#> # ... with 336,772 more rows</span></code></pre></div>
+<p>This is why you can’t supply a column name to <code>group_by()</code>. This amounts to creating a new column containing the string recycled to the number of rows:</p>
+<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"><span class="kw">group_by</span>(df, <span class="st">"month"</span>)
+<span class="co">#> # A tibble: 336,776 x 5</span>
+<span class="co">#> # Groups: "month" [1]</span>
+<span class="co">#> year month day dep_time `"month"`</span>
+<span class="co">#> <int> <int> <int> <int> <chr> </span>
+<span class="co">#> 1 2013 1 1 517 month </span>
+<span class="co">#> 2 2013 1 1 533 month </span>
+<span class="co">#> 3 2013 1 1 542 month </span>
+<span class="co">#> 4 2013 1 1 544 month </span>
+<span class="co">#> # ... with 336,772 more rows</span></code></pre></div>
+<p>Since grouping with select semantics can be sometimes useful as well, we have added the <code>group_by_at()</code> variant. In dplyr, variants suffixed with <code>_at()</code> support selection semantics in their second argument. You just need to wrap the selection with <code>vars()</code>:</p>
+<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"><span class="kw">group_by_at</span>(df, <span class="kw">vars</span>(year<span class="op">:</span>day))
+<span class="co">#> # A tibble: 336,776 x 4</span>
+<span class="co">#> # Groups: year, month, day [365]</span>
+<span class="co">#> year month day dep_time</span>
+<span class="co">#> <int> <int> <int> <int></span>
+<span class="co">#> 1 2013 1 1 517</span>
+<span class="co">#> 2 2013 1 1 533</span>
+<span class="co">#> 3 2013 1 1 542</span>
+<span class="co">#> 4 2013 1 1 544</span>
+<span class="co">#> # ... with 336,772 more rows</span></code></pre></div>
+<p>You can read more about the <code>_at()</code> and <code>_if()</code> variants in the <code>?scoped</code> help page.</p>
+</div>
+</div>
+<div id="piping" class="section level2">
+<h2>Piping</h2>
+<p>The dplyr API is functional in the sense that function calls don’t have side-effects. You must always save their results. This doesn’t lead to particularly elegant code, especially if you want to do many operations at once. You either have to do it step-by-step:</p>
+<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r">a1 <-<span class="st"> </span><span class="kw">group_by</span>(flights, year, month, day)
+a2 <-<span class="st"> </span><span class="kw">select</span>(a1, arr_delay, dep_delay)
+a3 <-<span class="st"> </span><span class="kw">summarise</span>(a2,
+ <span class="dt">arr =</span> <span class="kw">mean</span>(arr_delay, <span class="dt">na.rm =</span> <span class="ot">TRUE</span>),
+ <span class="dt">dep =</span> <span class="kw">mean</span>(dep_delay, <span class="dt">na.rm =</span> <span class="ot">TRUE</span>))
+a4 <-<span class="st"> </span><span class="kw">filter</span>(a3, arr <span class="op">></span><span class="st"> </span><span class="dv">30</span> <span class="op">|</span><span class="st"> </span>dep <span class="op">></span><span class="st"> </span><span class="dv">30</span>)</code></pre></div>
+<p>Or if you don’t want to name the intermediate results, you need to wrap the function calls inside each other:</p>
+<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"><span class="kw">filter</span>(
+ <span class="kw">summarise</span>(
+ <span class="kw">select</span>(
+ <span class="kw">group_by</span>(flights, year, month, day),
+ arr_delay, dep_delay
+ ),
+ <span class="dt">arr =</span> <span class="kw">mean</span>(arr_delay, <span class="dt">na.rm =</span> <span class="ot">TRUE</span>),
+ <span class="dt">dep =</span> <span class="kw">mean</span>(dep_delay, <span class="dt">na.rm =</span> <span class="ot">TRUE</span>)
+ ),
+ arr <span class="op">></span><span class="st"> </span><span class="dv">30</span> <span class="op">|</span><span class="st"> </span>dep <span class="op">></span><span class="st"> </span><span class="dv">30</span>
+)
+<span class="co">#> Adding missing grouping variables: `year`, `month`, `day`</span>
+<span class="co">#> # A tibble: 49 x 5</span>
+<span class="co">#> # Groups: year, month [11]</span>
+<span class="co">#> year month day arr dep</span>
+<span class="co">#> <int> <int> <int> <dbl> <dbl></span>
+<span class="co">#> 1 2013 1 16 34.2 24.6</span>
+<span class="co">#> 2 2013 1 31 32.6 28.7</span>
+<span class="co">#> 3 2013 2 11 36.3 39.1</span>
+<span class="co">#> 4 2013 2 27 31.3 37.8</span>
+<span class="co">#> # ... with 45 more rows</span></code></pre></div>
+<p>This is difficult to read because the order of the operations is from inside to out. Thus, the arguments are a long way away from the function. To get around this problem, dplyr provides the <code>%>%</code> operator from magrittr. <code>x %>% f(y)</code> turns into <code>f(x, y)</code> so you can use it to rewrite multiple operations that you can read left-to-right, top-to-bottom:</p>
+<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r">flights <span class="op">%>%</span>
+<span class="st"> </span><span class="kw">group_by</span>(year, month, day) <span class="op">%>%</span>
+<span class="st"> </span><span class="kw">select</span>(arr_delay, dep_delay) <span class="op">%>%</span>
+<span class="st"> </span><span class="kw">summarise</span>(
+ <span class="dt">arr =</span> <span class="kw">mean</span>(arr_delay, <span class="dt">na.rm =</span> <span class="ot">TRUE</span>),
+ <span class="dt">dep =</span> <span class="kw">mean</span>(dep_delay, <span class="dt">na.rm =</span> <span class="ot">TRUE</span>)
+ ) <span class="op">%>%</span>
+<span class="st"> </span><span class="kw">filter</span>(arr <span class="op">></span><span class="st"> </span><span class="dv">30</span> <span class="op">|</span><span class="st"> </span>dep <span class="op">></span><span class="st"> </span><span class="dv">30</span>)</code></pre></div>
+</div>
+<div id="other-data-sources" class="section level2">
+<h2>Other data sources</h2>
+<p>As well as data frames, dplyr works with data that is stored in other ways, like data tables, databases and multidimensional arrays.</p>
+<div id="data-table" class="section level3">
+<h3>Data table</h3>
+<p>dplyr also provides <a href="http://datatable.r-forge.r-project.org/">data table</a> methods for all verbs through <a href="http://github.com/hadley/dtplyr">dtplyr</a>. If you’re using data.tables already this lets you to use dplyr syntax for data manipulation, and data.table for everything else.</p>
+<p>For multiple operations, data.table can be faster because you usually use it with multiple verbs simultaneously. For example, with data table you can do a mutate and a select in a single step. It’s smart enough to know that there’s no point in computing the new variable for rows you’re about to throw away.</p>
+<p>The advantages of using dplyr with data tables are:</p>
+<ul>
+<li><p>For common data manipulation tasks, it insulates you from the reference semantics of data.tables, and protects you from accidentally modifying your data.</p></li>
+<li><p>Instead of one complex method built on the subscripting operator (<code>[</code>), it provides many simple methods.</p></li>
+</ul>
+</div>
+<div id="databases" class="section level3">
+<h3>Databases</h3>
+<p>dplyr also allows you to use the same verbs with a remote database. It takes care of generating the SQL for you so that you can avoid the cognitive challenge of constantly switching between languages. To use these capabilities, you’ll need to install the dbplyr package and then read <code>vignette("dbplyr")</code> for the details.</p>
+</div>
+<div id="multidimensional-arrays-cubes" class="section level3">
+<h3>Multidimensional arrays / cubes</h3>
+<p><code>tbl_cube()</code> provides an experimental interface to multidimensional arrays or data cubes. If you’re using this form of data in R, please get in touch so I can better understand your needs.</p>
+</div>
+</div>
+<div id="comparisons" class="section level2">
+<h2>Comparisons</h2>
+<p>Compared to all existing options, dplyr:</p>
+<ul>
+<li><p>abstracts away how your data is stored, so that you can work with data frames, data tables and remote databases using the same set of functions. This lets you focus on what you want to achieve, not on the logistics of data storage.</p></li>
+<li><p>provides a thoughtful default <code>print()</code> method that doesn’t automatically print pages of data to the screen (this was inspired by data table’s output).</p></li>
+</ul>
+<p>Compared to base functions:</p>
+<ul>
+<li><p>dplyr is much more consistent; functions have the same interface. So once you’ve mastered one, you can easily pick up the others</p></li>
+<li><p>base functions tend to be based around vectors; dplyr is based around data frames</p></li>
+</ul>
+<p>Compared to plyr, dplyr:</p>
+<ul>
+<li><p>is much much faster</p></li>
+<li><p>provides a better thought out set of joins</p></li>
+<li><p>only provides tools for working with data frames (e.g. most of dplyr is equivalent to <code>ddply()</code> + various functions, <code>do()</code> is equivalent to <code>dlply()</code>)</p></li>
+</ul>
+<p>Compared to virtual data frame approaches:</p>
+<ul>
+<li><p>it doesn’t pretend that you have a data frame: if you want to run lm etc, you’ll still need to manually pull down the data</p></li>
+<li><p>it doesn’t provide methods for R summary functions (e.g. <code>mean()</code>, or <code>sum()</code>)</p></li>
+</ul>
+</div>
+
+
+
+<!-- dynamically load mathjax for compatibility with self-contained -->
+<script>
+ (function () {
+ var script = document.createElement("script");
+ script.type = "text/javascript";
+ script.src = "https://mathjax.rstudio.com/latest/MathJax.js?config=TeX-AMS-MML_HTMLorMML";
+ document.getElementsByTagName("head")[0].appendChild(script);
+ })();
+</script>
+
+</body>
+</html>
diff --git a/inst/doc/hybrid-evaluation.R b/inst/doc/hybrid-evaluation.R
deleted file mode 100644
index fff9a21..0000000
--- a/inst/doc/hybrid-evaluation.R
+++ /dev/null
@@ -1,12 +0,0 @@
-## ---- echo = FALSE, message = FALSE--------------------------------------
-knitr::opts_chunk$set(collapse = T, comment = "#>")
-options(tibble.print_min = 4L, tibble.print_max = 4L)
-library(dplyr)
-
-## ---- eval = FALSE-------------------------------------------------------
-# summarise(per_day, flights = sum(flights))
-
-## ---- eval=FALSE---------------------------------------------------------
-# foo <- function(x) x*x
-# summarise(per_day, flights = foo(sum(flights)) )
-
diff --git a/inst/doc/introduction.html b/inst/doc/introduction.html
deleted file mode 100644
index f4d2df5..0000000
--- a/inst/doc/introduction.html
+++ /dev/null
@@ -1,570 +0,0 @@
-<!DOCTYPE html>
-
-<html xmlns="http://www.w3.org/1999/xhtml">
-
-<head>
-
-<meta charset="utf-8">
-<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
-<meta name="generator" content="pandoc" />
-
-<meta name="viewport" content="width=device-width, initial-scale=1">
-
-
-<meta name="date" content="2016-06-23" />
-
-<title>Introduction to dplyr</title>
-
-
-
-<style type="text/css">code{white-space: pre;}</style>
-<style type="text/css">
-div.sourceCode { overflow-x: auto; }
-table.sourceCode, tr.sourceCode, td.lineNumbers, td.sourceCode {
- margin: 0; padding: 0; vertical-align: baseline; border: none; }
-table.sourceCode { width: 100%; line-height: 100%; }
-td.lineNumbers { text-align: right; padding-right: 4px; padding-left: 4px; color: #aaaaaa; border-right: 1px solid #aaaaaa; }
-td.sourceCode { padding-left: 5px; }
-code > span.kw { color: #007020; font-weight: bold; } /* Keyword */
-code > span.dt { color: #902000; } /* DataType */
-code > span.dv { color: #40a070; } /* DecVal */
-code > span.bn { color: #40a070; } /* BaseN */
-code > span.fl { color: #40a070; } /* Float */
-code > span.ch { color: #4070a0; } /* Char */
-code > span.st { color: #4070a0; } /* String */
-code > span.co { color: #60a0b0; font-style: italic; } /* Comment */
-code > span.ot { color: #007020; } /* Other */
-code > span.al { color: #ff0000; font-weight: bold; } /* Alert */
-code > span.fu { color: #06287e; } /* Function */
-code > span.er { color: #ff0000; font-weight: bold; } /* Error */
-code > span.wa { color: #60a0b0; font-weight: bold; font-style: italic; } /* Warning */
-code > span.cn { color: #880000; } /* Constant */
-code > span.sc { color: #4070a0; } /* SpecialChar */
-code > span.vs { color: #4070a0; } /* VerbatimString */
-code > span.ss { color: #bb6688; } /* SpecialString */
-code > span.im { } /* Import */
-code > span.va { color: #19177c; } /* Variable */
-code > span.cf { color: #007020; font-weight: bold; } /* ControlFlow */
-code > span.op { color: #666666; } /* Operator */
-code > span.bu { } /* BuiltIn */
-code > span.ex { } /* Extension */
-code > span.pp { color: #bc7a00; } /* Preprocessor */
-code > span.at { color: #7d9029; } /* Attribute */
-code > span.do { color: #ba2121; font-style: italic; } /* Documentation */
-code > span.an { color: #60a0b0; font-weight: bold; font-style: italic; } /* Annotation */
-code > span.cv { color: #60a0b0; font-weight: bold; font-style: italic; } /* CommentVar */
-code > span.in { color: #60a0b0; font-weight: bold; font-style: italic; } /* Information */
-</style>
-
-
-
-<link href="data:text/css;charset=utf-8,body%20%7B%0Abackground%2Dcolor%3A%20%23fff%3B%0Amargin%3A%201em%20auto%3B%0Amax%2Dwidth%3A%20700px%3B%0Aoverflow%3A%20visible%3B%0Apadding%2Dleft%3A%202em%3B%0Apadding%2Dright%3A%202em%3B%0Afont%2Dfamily%3A%20%22Open%20Sans%22%2C%20%22Helvetica%20Neue%22%2C%20Helvetica%2C%20Arial%2C%20sans%2Dserif%3B%0Afont%2Dsize%3A%2014px%3B%0Aline%2Dheight%3A%201%2E35%3B%0A%7D%0A%23header%20%7B%0Atext%2Dalign%3A%20center%3B%0A%7D%0A%23TOC%20%7B%0Aclear%3A%20bot [...]
-
-</head>
-
-<body>
-
-
-
-
-<h1 class="title toc-ignore">Introduction to dplyr</h1>
-<h4 class="date"><em>2016-06-23</em></h4>
-
-
-
-<p>When working with data you must:</p>
-<ul>
-<li><p>Figure out what you want to do.</p></li>
-<li><p>Describe those tasks in the form of a computer program.</p></li>
-<li><p>Execute the program.</p></li>
-</ul>
-<p>The dplyr package makes these steps fast and easy:</p>
-<ul>
-<li><p>By constraining your options, it simplifies how you can think about common data manipulation tasks.</p></li>
-<li><p>It provides simple “verbs”, functions that correspond to the most common data manipulation tasks, to help you translate those thoughts into code.</p></li>
-<li><p>It uses efficient data storage backends, so you spend less time waiting for the computer.</p></li>
-</ul>
-<p>This document introduces you to dplyr’s basic set of tools, and shows you how to apply them to data frames. Other vignettes provide more details on specific topics:</p>
-<ul>
-<li><p>databases: Besides in-memory data frames, dplyr also connects to out-of-memory, remote databases. And by translating your R code into the appropriate SQL, it allows you to work with both types of data using the same set of tools.</p></li>
-<li><p>benchmark-baseball: see how dplyr compares to other tools for data manipulation on a realistic use case.</p></li>
-<li><p>window-functions: a window function is a variation on an aggregation function. Where an aggregate function uses <code>n</code> inputs to produce 1 output, a window function uses <code>n</code> inputs to produce <code>n</code> outputs.</p></li>
-</ul>
-<div id="data-nycflights13" class="section level2">
-<h2>Data: nycflights13</h2>
-<p>To explore the basic data manipulation verbs of dplyr, we’ll start with the built in <code>nycflights13</code> data frame. This dataset contains all 336776 flights that departed from New York City in 2013. The data comes from the US <a href="http://www.transtats.bts.gov/DatabaseInfo.asp?DB_ID=120&Link=0">Bureau of Transportation Statistics</a>, and is documented in <code>?nycflights13</code></p>
-<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"><span class="kw">library</span>(nycflights13)
-<span class="kw">dim</span>(flights)
-<span class="co">#> [1] 336776 19</span>
-<span class="kw">head</span>(flights)
-<span class="co">#> # A tibble: 6 x 19</span>
-<span class="co">#> year month day dep_time sched_dep_time dep_delay arr_time</span>
-<span class="co">#> <int> <int> <int> <int> <int> <dbl> <int></span>
-<span class="co">#> 1 2013 1 1 517 515 2 830</span>
-<span class="co">#> 2 2013 1 1 533 529 4 850</span>
-<span class="co">#> 3 2013 1 1 542 540 2 923</span>
-<span class="co">#> 4 2013 1 1 544 545 -1 1004</span>
-<span class="co">#> ... with 2 more rows, and 12 more variables: sched_arr_time <int>,</span>
-<span class="co">#> arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,</span>
-<span class="co">#> origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,</span>
-<span class="co">#> minute <dbl>, time_hour <time></span></code></pre></div>
-<p>dplyr can work with data frames as is, but if you’re dealing with large data, it’s worthwhile to convert them to a <code>tbl_df</code>: this is a wrapper around a data frame that won’t accidentally print a lot of data to the screen.</p>
-</div>
-<div id="single-table-verbs" class="section level2">
-<h2>Single table verbs</h2>
-<p>Dplyr aims to provide a function for each basic verb of data manipulation:</p>
-<ul>
-<li><code>filter()</code> (and <code>slice()</code>)</li>
-<li><code>arrange()</code></li>
-<li><code>select()</code> (and <code>rename()</code>)</li>
-<li><code>distinct()</code></li>
-<li><code>mutate()</code> (and <code>transmute()</code>)</li>
-<li><code>summarise()</code></li>
-<li><code>sample_n()</code> (and <code>sample_frac()</code>)</li>
-</ul>
-<p>If you’ve used plyr before, many of these will be familar.</p>
-</div>
-<div id="filter-rows-with-filter" class="section level2">
-<h2>Filter rows with <code>filter()</code></h2>
-<p><code>filter()</code> allows you to select a subset of rows in a data frame. The first argument is the name of the data frame. The second and subsequent arguments are the expressions that filter the data frame:</p>
-<p>For example, we can select all flights on January 1st with:</p>
-<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"><span class="kw">filter</span>(flights, month ==<span class="st"> </span><span class="dv">1</span>, day ==<span class="st"> </span><span class="dv">1</span>)
-<span class="co">#> # A tibble: 842 x 19</span>
-<span class="co">#> year month day dep_time sched_dep_time dep_delay arr_time</span>
-<span class="co">#> <int> <int> <int> <int> <int> <dbl> <int></span>
-<span class="co">#> 1 2013 1 1 517 515 2 830</span>
-<span class="co">#> 2 2013 1 1 533 529 4 850</span>
-<span class="co">#> 3 2013 1 1 542 540 2 923</span>
-<span class="co">#> 4 2013 1 1 544 545 -1 1004</span>
-<span class="co">#> ... with 838 more rows, and 12 more variables: sched_arr_time <int>,</span>
-<span class="co">#> arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,</span>
-<span class="co">#> origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,</span>
-<span class="co">#> minute <dbl>, time_hour <time></span></code></pre></div>
-<p>This is equivalent to the more verbose code in base R:</p>
-<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r">flights[flights$month ==<span class="st"> </span><span class="dv">1</span> &<span class="st"> </span>flights$day ==<span class="st"> </span><span class="dv">1</span>, ]</code></pre></div>
-<p><code>filter()</code> works similarly to <code>subset()</code> except that you can give it any number of filtering conditions, which are joined together with <code>&</code> (not <code>&&</code> which is easy to do accidentally!). You can also use other boolean operators:</p>
-<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"><span class="kw">filter</span>(flights, month ==<span class="st"> </span><span class="dv">1</span> |<span class="st"> </span>month ==<span class="st"> </span><span class="dv">2</span>)</code></pre></div>
-<p>To select rows by position, use <code>slice()</code>:</p>
-<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"><span class="kw">slice</span>(flights, <span class="dv">1</span>:<span class="dv">10</span>)
-<span class="co">#> # A tibble: 10 x 19</span>
-<span class="co">#> year month day dep_time sched_dep_time dep_delay arr_time</span>
-<span class="co">#> <int> <int> <int> <int> <int> <dbl> <int></span>
-<span class="co">#> 1 2013 1 1 517 515 2 830</span>
-<span class="co">#> 2 2013 1 1 533 529 4 850</span>
-<span class="co">#> 3 2013 1 1 542 540 2 923</span>
-<span class="co">#> 4 2013 1 1 544 545 -1 1004</span>
-<span class="co">#> ... with 6 more rows, and 12 more variables: sched_arr_time <int>,</span>
-<span class="co">#> arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,</span>
-<span class="co">#> origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,</span>
-<span class="co">#> minute <dbl>, time_hour <time></span></code></pre></div>
-</div>
-<div id="arrange-rows-with-arrange" class="section level2">
-<h2>Arrange rows with <code>arrange()</code></h2>
-<p><code>arrange()</code> works similarly to <code>filter()</code> except that instead of filtering or selecting rows, it reorders them. It takes a data frame, and a set of column names (or more complicated expressions) to order by. If you provide more than one column name, each additional column will be used to break ties in the values of preceding columns:</p>
-<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"><span class="kw">arrange</span>(flights, year, month, day)
-<span class="co">#> # A tibble: 336,776 x 19</span>
-<span class="co">#> year month day dep_time sched_dep_time dep_delay arr_time</span>
-<span class="co">#> <int> <int> <int> <int> <int> <dbl> <int></span>
-<span class="co">#> 1 2013 1 1 517 515 2 830</span>
-<span class="co">#> 2 2013 1 1 533 529 4 850</span>
-<span class="co">#> 3 2013 1 1 542 540 2 923</span>
-<span class="co">#> 4 2013 1 1 544 545 -1 1004</span>
-<span class="co">#> ... with 336,772 more rows, and 12 more variables: sched_arr_time <int>,</span>
-<span class="co">#> arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,</span>
-<span class="co">#> origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,</span>
-<span class="co">#> minute <dbl>, time_hour <time></span></code></pre></div>
-<p>Use <code>desc()</code> to order a column in descending order:</p>
-<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"><span class="kw">arrange</span>(flights, <span class="kw">desc</span>(arr_delay))
-<span class="co">#> # A tibble: 336,776 x 19</span>
-<span class="co">#> year month day dep_time sched_dep_time dep_delay arr_time</span>
-<span class="co">#> <int> <int> <int> <int> <int> <dbl> <int></span>
-<span class="co">#> 1 2013 1 9 641 900 1301 1242</span>
-<span class="co">#> 2 2013 6 15 1432 1935 1137 1607</span>
-<span class="co">#> 3 2013 1 10 1121 1635 1126 1239</span>
-<span class="co">#> 4 2013 9 20 1139 1845 1014 1457</span>
-<span class="co">#> ... with 336,772 more rows, and 12 more variables: sched_arr_time <int>,</span>
-<span class="co">#> arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,</span>
-<span class="co">#> origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,</span>
-<span class="co">#> minute <dbl>, time_hour <time></span></code></pre></div>
-<p><code>dplyr::arrange()</code> works the same way as <code>plyr::arrange()</code>. It’s a straightforward wrapper around <code>order()</code> that requires less typing. The previous code is equivalent to:</p>
-<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r">flights[<span class="kw">order</span>(flights$year, flights$month, flights$day), ]
-flights[<span class="kw">order</span>(flights$arr_delay, <span class="dt">decreasing =</span> <span class="ot">TRUE</span>), ] or flights[<span class="kw">order</span>(-flights$arr_delay), ]</code></pre></div>
-</div>
-<div id="select-columns-with-select" class="section level2">
-<h2>Select columns with <code>select()</code></h2>
-<p>Often you work with large datasets with many columns but only a few are actually of interest to you. <code>select()</code> allows you to rapidly zoom in on a useful subset using operations that usually only work on numeric variable positions:</p>
-<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"><span class="co"># Select columns by name</span>
-<span class="kw">select</span>(flights, year, month, day)
-<span class="co">#> # A tibble: 336,776 x 3</span>
-<span class="co">#> year month day</span>
-<span class="co">#> <int> <int> <int></span>
-<span class="co">#> 1 2013 1 1</span>
-<span class="co">#> 2 2013 1 1</span>
-<span class="co">#> 3 2013 1 1</span>
-<span class="co">#> 4 2013 1 1</span>
-<span class="co">#> ... with 336,772 more rows</span>
-<span class="co"># Select all columns between year and day (inclusive)</span>
-<span class="kw">select</span>(flights, year:day)
-<span class="co">#> # A tibble: 336,776 x 3</span>
-<span class="co">#> year month day</span>
-<span class="co">#> <int> <int> <int></span>
-<span class="co">#> 1 2013 1 1</span>
-<span class="co">#> 2 2013 1 1</span>
-<span class="co">#> 3 2013 1 1</span>
-<span class="co">#> 4 2013 1 1</span>
-<span class="co">#> ... with 336,772 more rows</span>
-<span class="co"># Select all columns except those from year to day (inclusive)</span>
-<span class="kw">select</span>(flights, -(year:day))
-<span class="co">#> # A tibble: 336,776 x 16</span>
-<span class="co">#> dep_time sched_dep_time dep_delay arr_time sched_arr_time arr_delay</span>
-<span class="co">#> <int> <int> <dbl> <int> <int> <dbl></span>
-<span class="co">#> 1 517 515 2 830 819 11</span>
-<span class="co">#> 2 533 529 4 850 830 20</span>
-<span class="co">#> 3 542 540 2 923 850 33</span>
-<span class="co">#> 4 544 545 -1 1004 1022 -18</span>
-<span class="co">#> ... with 336,772 more rows, and 10 more variables: carrier <chr>,</span>
-<span class="co">#> flight <int>, tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>,</span>
-<span class="co">#> distance <dbl>, hour <dbl>, minute <dbl>, time_hour <time></span></code></pre></div>
-<p>This function works similarly to the <code>select</code> argument in <code>base::subset()</code>. Because the dplyr philosophy is to have small functions that do one thing well, it’s its own function in dplyr.</p>
-<p>There are a number of helper functions you can use within <code>select()</code>, like <code>starts_with()</code>, <code>ends_with()</code>, <code>matches()</code> and <code>contains()</code>. These let you quickly match larger blocks of variables that meet some criterion. See <code>?select</code> for more details.</p>
-<p>You can rename variables with <code>select()</code> by using named arguments:</p>
-<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"><span class="kw">select</span>(flights, <span class="dt">tail_num =</span> tailnum)
-<span class="co">#> # A tibble: 336,776 x 1</span>
-<span class="co">#> tail_num</span>
-<span class="co">#> <chr></span>
-<span class="co">#> 1 N14228</span>
-<span class="co">#> 2 N24211</span>
-<span class="co">#> 3 N619AA</span>
-<span class="co">#> 4 N804JB</span>
-<span class="co">#> ... with 336,772 more rows</span></code></pre></div>
-<p>But because <code>select()</code> drops all the variables not explicitly mentioned, it’s not that useful. Instead, use <code>rename()</code>:</p>
-<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"><span class="kw">rename</span>(flights, <span class="dt">tail_num =</span> tailnum)
-<span class="co">#> # A tibble: 336,776 x 19</span>
-<span class="co">#> year month day dep_time sched_dep_time dep_delay arr_time</span>
-<span class="co">#> <int> <int> <int> <int> <int> <dbl> <int></span>
-<span class="co">#> 1 2013 1 1 517 515 2 830</span>
-<span class="co">#> 2 2013 1 1 533 529 4 850</span>
-<span class="co">#> 3 2013 1 1 542 540 2 923</span>
-<span class="co">#> 4 2013 1 1 544 545 -1 1004</span>
-<span class="co">#> ... with 336,772 more rows, and 12 more variables: sched_arr_time <int>,</span>
-<span class="co">#> arr_delay <dbl>, carrier <chr>, flight <int>, tail_num <chr>,</span>
-<span class="co">#> origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,</span>
-<span class="co">#> minute <dbl>, time_hour <time></span></code></pre></div>
-</div>
-<div id="extract-distinct-unique-rows" class="section level2">
-<h2>Extract distinct (unique) rows</h2>
-<p>Use <code>distinct()</code>to find unique values in a table:</p>
-<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"><span class="kw">distinct</span>(flights, tailnum)
-<span class="co">#> # A tibble: 4,044 x 1</span>
-<span class="co">#> tailnum</span>
-<span class="co">#> <chr></span>
-<span class="co">#> 1 N14228</span>
-<span class="co">#> 2 N24211</span>
-<span class="co">#> 3 N619AA</span>
-<span class="co">#> 4 N804JB</span>
-<span class="co">#> ... with 4,040 more rows</span>
-<span class="kw">distinct</span>(flights, origin, dest)
-<span class="co">#> # A tibble: 224 x 2</span>
-<span class="co">#> origin dest</span>
-<span class="co">#> <chr> <chr></span>
-<span class="co">#> 1 EWR IAH</span>
-<span class="co">#> 2 LGA IAH</span>
-<span class="co">#> 3 JFK MIA</span>
-<span class="co">#> 4 JFK BQN</span>
-<span class="co">#> ... with 220 more rows</span></code></pre></div>
-<p>(This is very similar to <code>base::unique()</code> but should be much faster.)</p>
-</div>
-<div id="add-new-columns-with-mutate" class="section level2">
-<h2>Add new columns with <code>mutate()</code></h2>
-<p>Besides selecting sets of existing columns, it’s often useful to add new columns that are functions of existing columns. This is the job of <code>mutate()</code>:</p>
-<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"><span class="kw">mutate</span>(flights,
- <span class="dt">gain =</span> arr_delay -<span class="st"> </span>dep_delay,
- <span class="dt">speed =</span> distance /<span class="st"> </span>air_time *<span class="st"> </span><span class="dv">60</span>)
-<span class="co">#> # A tibble: 336,776 x 21</span>
-<span class="co">#> year month day dep_time sched_dep_time dep_delay arr_time</span>
-<span class="co">#> <int> <int> <int> <int> <int> <dbl> <int></span>
-<span class="co">#> 1 2013 1 1 517 515 2 830</span>
-<span class="co">#> 2 2013 1 1 533 529 4 850</span>
-<span class="co">#> 3 2013 1 1 542 540 2 923</span>
-<span class="co">#> 4 2013 1 1 544 545 -1 1004</span>
-<span class="co">#> ... with 336,772 more rows, and 14 more variables: sched_arr_time <int>,</span>
-<span class="co">#> arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,</span>
-<span class="co">#> origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,</span>
-<span class="co">#> minute <dbl>, time_hour <time>, gain <dbl>, speed <dbl></span></code></pre></div>
-<p><code>dplyr::mutate()</code> works the same way as <code>plyr::mutate()</code> and similarly to <code>base::transform()</code>. The key difference between <code>mutate()</code> and <code>transform()</code> is that mutate allows you to refer to columns that you’ve just created:</p>
-<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"><span class="kw">mutate</span>(flights,
- <span class="dt">gain =</span> arr_delay -<span class="st"> </span>dep_delay,
- <span class="dt">gain_per_hour =</span> gain /<span class="st"> </span>(air_time /<span class="st"> </span><span class="dv">60</span>)
-)
-<span class="co">#> # A tibble: 336,776 x 21</span>
-<span class="co">#> year month day dep_time sched_dep_time dep_delay arr_time</span>
-<span class="co">#> <int> <int> <int> <int> <int> <dbl> <int></span>
-<span class="co">#> 1 2013 1 1 517 515 2 830</span>
-<span class="co">#> 2 2013 1 1 533 529 4 850</span>
-<span class="co">#> 3 2013 1 1 542 540 2 923</span>
-<span class="co">#> 4 2013 1 1 544 545 -1 1004</span>
-<span class="co">#> ... with 336,772 more rows, and 14 more variables: sched_arr_time <int>,</span>
-<span class="co">#> arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,</span>
-<span class="co">#> origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,</span>
-<span class="co">#> minute <dbl>, time_hour <time>, gain <dbl>, gain_per_hour <dbl></span></code></pre></div>
-<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"><span class="kw">transform</span>(flights,
- <span class="dt">gain =</span> arr_delay -<span class="st"> </span>delay,
- <span class="dt">gain_per_hour =</span> gain /<span class="st"> </span>(air_time /<span class="st"> </span><span class="dv">60</span>)
-)
-<span class="co">#> Error: object 'gain' not found</span></code></pre></div>
-<p>If you only want to keep the new variables, use <code>transmute()</code>:</p>
-<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"><span class="kw">transmute</span>(flights,
- <span class="dt">gain =</span> arr_delay -<span class="st"> </span>dep_delay,
- <span class="dt">gain_per_hour =</span> gain /<span class="st"> </span>(air_time /<span class="st"> </span><span class="dv">60</span>)
-)
-<span class="co">#> # A tibble: 336,776 x 2</span>
-<span class="co">#> gain gain_per_hour</span>
-<span class="co">#> <dbl> <dbl></span>
-<span class="co">#> 1 9 2.378855</span>
-<span class="co">#> 2 16 4.229075</span>
-<span class="co">#> 3 31 11.625000</span>
-<span class="co">#> 4 -17 -5.573770</span>
-<span class="co">#> ... with 336,772 more rows</span></code></pre></div>
-</div>
-<div id="summarise-values-with-summarise" class="section level2">
-<h2>Summarise values with <code>summarise()</code></h2>
-<p>The last verb is <code>summarise()</code>. It collapses a data frame to a single row (this is exactly equivalent to <code>plyr::summarise()</code>):</p>
-<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"><span class="kw">summarise</span>(flights,
- <span class="dt">delay =</span> <span class="kw">mean</span>(dep_delay, <span class="dt">na.rm =</span> <span class="ot">TRUE</span>))
-<span class="co">#> # A tibble: 1 x 1</span>
-<span class="co">#> delay</span>
-<span class="co">#> <dbl></span>
-<span class="co">#> 1 12.63907</span></code></pre></div>
-<p>Below, we’ll see how this verb can be very useful.</p>
-</div>
-<div id="randomly-sample-rows-with-sample_n-and-sample_frac" class="section level2">
-<h2>Randomly sample rows with <code>sample_n()</code> and <code>sample_frac()</code></h2>
-<p>You can use <code>sample_n()</code> and <code>sample_frac()</code> to take a random sample of rows: use <code>sample_n()</code> for a fixed number and <code>sample_frac()</code> for a fixed fraction.</p>
-<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"><span class="kw">sample_n</span>(flights, <span class="dv">10</span>)
-<span class="co">#> # A tibble: 10 x 19</span>
-<span class="co">#> year month day dep_time sched_dep_time dep_delay arr_time</span>
-<span class="co">#> <int> <int> <int> <int> <int> <dbl> <int></span>
-<span class="co">#> 1 2013 7 8 2205 2019 106 103</span>
-<span class="co">#> 2 2013 9 12 1602 1545 17 NA</span>
-<span class="co">#> 3 2013 11 4 1459 1459 0 1642</span>
-<span class="co">#> 4 2013 10 25 1354 1350 4 1534</span>
-<span class="co">#> ... with 6 more rows, and 12 more variables: sched_arr_time <int>,</span>
-<span class="co">#> arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,</span>
-<span class="co">#> origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,</span>
-<span class="co">#> minute <dbl>, time_hour <time></span>
-<span class="kw">sample_frac</span>(flights, <span class="fl">0.01</span>)
-<span class="co">#> # A tibble: 3,368 x 19</span>
-<span class="co">#> year month day dep_time sched_dep_time dep_delay arr_time</span>
-<span class="co">#> <int> <int> <int> <int> <int> <dbl> <int></span>
-<span class="co">#> 1 2013 5 14 850 850 0 1237</span>
-<span class="co">#> 2 2013 11 8 832 840 -8 1016</span>
-<span class="co">#> 3 2013 12 1 1155 1155 0 1309</span>
-<span class="co">#> 4 2013 1 1 929 925 4 1220</span>
-<span class="co">#> ... with 3,364 more rows, and 12 more variables: sched_arr_time <int>,</span>
-<span class="co">#> arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,</span>
-<span class="co">#> origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,</span>
-<span class="co">#> minute <dbl>, time_hour <time></span></code></pre></div>
-<p>Use <code>replace = TRUE</code> to perform a bootstrap sample. If needed, you can weight the sample with the <code>weight</code> argument.</p>
-</div>
-<div id="commonalities" class="section level2">
-<h2>Commonalities</h2>
-<p>You may have noticed that the syntax and function of all these verbs are very similar:</p>
-<ul>
-<li><p>The first argument is a data frame.</p></li>
-<li><p>The subsequent arguments describe what to do with the data frame. Notice that you can refer to columns in the data frame directly without using <code>$</code>.</p></li>
-<li><p>The result is a new data frame</p></li>
-</ul>
-<p>Together these properties make it easy to chain together multiple simple steps to achieve a complex result.</p>
-<p>These five functions provide the basis of a language of data manipulation. At the most basic level, you can only alter a tidy data frame in five useful ways: you can reorder the rows (<code>arrange()</code>), pick observations and variables of interest (<code>filter()</code> and <code>select()</code>), add new variables that are functions of existing variables (<code>mutate()</code>), or collapse many values to a summary (<code>summarise()</code>). The remainder of the language comes [...]
-</div>
-<div id="grouped-operations" class="section level1">
-<h1>Grouped operations</h1>
-<p>These verbs are useful on their own, but they become really powerful when you apply them to groups of observations within a dataset. In dplyr, you do this by with the <code>group_by()</code> function. It breaks down a dataset into specified groups of rows. When you then apply the verbs above on the resulting object they’ll be automatically applied “by group”. Most importantly, all this is achieved by using the same exact syntax you’d use with an ungrouped object.</p>
-<p>Grouping affects the verbs as follows:</p>
-<ul>
-<li><p>grouped <code>select()</code> is the same as ungrouped <code>select()</code>, except that grouping variables are always retained.</p></li>
-<li><p>grouped <code>arrange()</code> orders first by the grouping variables</p></li>
-<li><p><code>mutate()</code> and <code>filter()</code> are most useful in conjunction with window functions (like <code>rank()</code>, or <code>min(x) == x</code>). They are described in detail in <code>vignette("window-functions")</code>.</p></li>
-<li><p><code>sample_n()</code> and <code>sample_frac()</code> sample the specified number/fraction of rows in each group.</p></li>
-<li><p><code>slice()</code> extracts rows within each group.</p></li>
-<li><p><code>summarise()</code> is powerful and easy to understand, as described in more detail below.</p></li>
-</ul>
-<p>In the following example, we split the complete dataset into individual planes and then summarise each plane by counting the number of flights (<code>count = n()</code>) and computing the average distance (<code>dist = mean(Distance, na.rm = TRUE)</code>) and arrival delay (<code>delay = mean(ArrDelay, na.rm = TRUE)</code>). We then use ggplot2 to display the output.</p>
-<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r">by_tailnum <-<span class="st"> </span><span class="kw">group_by</span>(flights, tailnum)
-delay <-<span class="st"> </span><span class="kw">summarise</span>(by_tailnum,
- <span class="dt">count =</span> <span class="kw">n</span>(),
- <span class="dt">dist =</span> <span class="kw">mean</span>(distance, <span class="dt">na.rm =</span> <span class="ot">TRUE</span>),
- <span class="dt">delay =</span> <span class="kw">mean</span>(arr_delay, <span class="dt">na.rm =</span> <span class="ot">TRUE</span>))
-delay <-<span class="st"> </span><span class="kw">filter</span>(delay, count ><span class="st"> </span><span class="dv">20</span>, dist <<span class="st"> </span><span class="dv">2000</span>)
-
-<span class="co"># Interestingly, the average delay is only slightly related to the</span>
-<span class="co"># average distance flown by a plane.</span>
-<span class="kw">ggplot</span>(delay, <span class="kw">aes</span>(dist, delay)) +
-<span class="st"> </span><span class="kw">geom_point</span>(<span class="kw">aes</span>(<span class="dt">size =</span> count), <span class="dt">alpha =</span> <span class="dv">1</span>/<span class="dv">2</span>) +
-<span class="st"> </span><span class="kw">geom_smooth</span>() +
-<span class="st"> </span><span class="kw">scale_size_area</span>()</code></pre></div>
-<p><img src="data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAkAAAAEgCAYAAABchszxAAAEDWlDQ1BJQ0MgUHJvZmlsZQAAOI2NVV1oHFUUPrtzZyMkzlNsNIV0qD8NJQ2TVjShtLp/3d02bpZJNtoi6GT27s6Yyc44M7v9oU9FUHwx6psUxL+3gCAo9Q/bPrQvlQol2tQgKD60+INQ6Ium65k7M5lpurHeZe58853vnnvuuWfvBei5qliWkRQBFpquLRcy4nOHj4g9K5CEh6AXBqFXUR0rXalMAjZPC3e1W99Dwntf2dXd/p+tt0YdFSBxH2Kz5qgLiI8B8KdVy3YBevqRHz/qWh72Yui3MUDEL3q44WPXw3M+fo1pZuQs4tOIBVVTaoiXEI/MxfhGDPsxsNZfoE1q66ro5aJim3XdoLFw72H+n23BaIXzbcOnz5mfPoTvYVz7KzUl5+FRxEuqkp9G/Ajia [...]
-<p>You use <code>summarise()</code> with <strong>aggregate functions</strong>, which take a vector of values and return a single number. There are many useful examples of such functions in base R like <code>min()</code>, <code>max()</code>, <code>mean()</code>, <code>sum()</code>, <code>sd()</code>, <code>median()</code>, and <code>IQR()</code>. dplyr provides a handful of others:</p>
-<ul>
-<li><p><code>n()</code>: the number of observations in the current group</p></li>
-<li><p><code>n_distinct(x)</code>:the number of unique values in <code>x</code>.</p></li>
-<li><p><code>first(x)</code>, <code>last(x)</code> and <code>nth(x, n)</code> - these work similarly to <code>x[1]</code>, <code>x[length(x)]</code>, and <code>x[n]</code> but give you more control over the result if the value is missing.</p></li>
-</ul>
-<p>For example, we could use these to find the number of planes and the number of flights that go to each possible destination:</p>
-<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r">destinations <-<span class="st"> </span><span class="kw">group_by</span>(flights, dest)
-<span class="kw">summarise</span>(destinations,
- <span class="dt">planes =</span> <span class="kw">n_distinct</span>(tailnum),
- <span class="dt">flights =</span> <span class="kw">n</span>()
-)
-<span class="co">#> # A tibble: 105 x 3</span>
-<span class="co">#> dest planes flights</span>
-<span class="co">#> <chr> <int> <int></span>
-<span class="co">#> 1 ABQ 108 254</span>
-<span class="co">#> 2 ACK 58 265</span>
-<span class="co">#> 3 ALB 172 439</span>
-<span class="co">#> 4 ANC 6 8</span>
-<span class="co">#> ... with 101 more rows</span></code></pre></div>
-<p>You can also use any function that you write yourself. For performance, dplyr provides optimised C++ versions of many of these functions. If you want to provide your own C++ function, see the hybrid-evaluation vignette for more details.</p>
-<p>When you group by multiple variables, each summary peels off one level of the grouping. That makes it easy to progressively roll-up a dataset:</p>
-<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r">daily <-<span class="st"> </span><span class="kw">group_by</span>(flights, year, month, day)
-(per_day <-<span class="st"> </span><span class="kw">summarise</span>(daily, <span class="dt">flights =</span> <span class="kw">n</span>()))
-<span class="co">#> Source: local data frame [365 x 4]</span>
-<span class="co">#> Groups: year, month [?]</span>
-<span class="co">#> </span>
-<span class="co">#> # A tibble: 365 x 4</span>
-<span class="co">#> year month day flights</span>
-<span class="co">#> <int> <int> <int> <int></span>
-<span class="co">#> 1 2013 1 1 842</span>
-<span class="co">#> 2 2013 1 2 943</span>
-<span class="co">#> 3 2013 1 3 914</span>
-<span class="co">#> 4 2013 1 4 915</span>
-<span class="co">#> ... with 361 more rows</span>
-(per_month <-<span class="st"> </span><span class="kw">summarise</span>(per_day, <span class="dt">flights =</span> <span class="kw">sum</span>(flights)))
-<span class="co">#> Source: local data frame [12 x 3]</span>
-<span class="co">#> Groups: year [?]</span>
-<span class="co">#> </span>
-<span class="co">#> # A tibble: 12 x 3</span>
-<span class="co">#> year month flights</span>
-<span class="co">#> <int> <int> <int></span>
-<span class="co">#> 1 2013 1 27004</span>
-<span class="co">#> 2 2013 2 24951</span>
-<span class="co">#> 3 2013 3 28834</span>
-<span class="co">#> 4 2013 4 28330</span>
-<span class="co">#> ... with 8 more rows</span>
-(per_year <-<span class="st"> </span><span class="kw">summarise</span>(per_month, <span class="dt">flights =</span> <span class="kw">sum</span>(flights)))
-<span class="co">#> # A tibble: 1 x 2</span>
-<span class="co">#> year flights</span>
-<span class="co">#> <int> <int></span>
-<span class="co">#> 1 2013 336776</span></code></pre></div>
-<p>However you need to be careful when progressively rolling up summaries like this: it’s ok for sums and counts, but you need to think about weighting for means and variances (it’s not possible to do this exactly for medians).</p>
-<div id="chaining" class="section level2">
-<h2>Chaining</h2>
-<p>The dplyr API is functional in the sense that function calls don’t have side-effects. You must always save their results. This doesn’t lead to particularly elegant code, especially if you want to do many operations at once. You either have to do it step-by-step:</p>
-<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r">a1 <-<span class="st"> </span><span class="kw">group_by</span>(flights, year, month, day)
-a2 <-<span class="st"> </span><span class="kw">select</span>(a1, arr_delay, dep_delay)
-a3 <-<span class="st"> </span><span class="kw">summarise</span>(a2,
- <span class="dt">arr =</span> <span class="kw">mean</span>(arr_delay, <span class="dt">na.rm =</span> <span class="ot">TRUE</span>),
- <span class="dt">dep =</span> <span class="kw">mean</span>(dep_delay, <span class="dt">na.rm =</span> <span class="ot">TRUE</span>))
-a4 <-<span class="st"> </span><span class="kw">filter</span>(a3, arr ><span class="st"> </span><span class="dv">30</span> |<span class="st"> </span>dep ><span class="st"> </span><span class="dv">30</span>)</code></pre></div>
-<p>Or if you don’t want to save the intermediate results, you need to wrap the function calls inside each other:</p>
-<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"><span class="kw">filter</span>(
- <span class="kw">summarise</span>(
- <span class="kw">select</span>(
- <span class="kw">group_by</span>(flights, year, month, day),
- arr_delay, dep_delay
- ),
- <span class="dt">arr =</span> <span class="kw">mean</span>(arr_delay, <span class="dt">na.rm =</span> <span class="ot">TRUE</span>),
- <span class="dt">dep =</span> <span class="kw">mean</span>(dep_delay, <span class="dt">na.rm =</span> <span class="ot">TRUE</span>)
- ),
- arr ><span class="st"> </span><span class="dv">30</span> |<span class="st"> </span>dep ><span class="st"> </span><span class="dv">30</span>
-)
-<span class="co">#> Adding missing grouping variables: `year`, `month`, `day`</span>
-<span class="co">#> Source: local data frame [49 x 5]</span>
-<span class="co">#> Groups: year, month [11]</span>
-<span class="co">#> </span>
-<span class="co">#> # A tibble: 49 x 5</span>
-<span class="co">#> year month day arr dep</span>
-<span class="co">#> <int> <int> <int> <dbl> <dbl></span>
-<span class="co">#> 1 2013 1 16 34.24736 24.61287</span>
-<span class="co">#> 2 2013 1 31 32.60285 28.65836</span>
-<span class="co">#> 3 2013 2 11 36.29009 39.07360</span>
-<span class="co">#> 4 2013 2 27 31.25249 37.76327</span>
-<span class="co">#> ... with 45 more rows</span></code></pre></div>
-<p>This is difficult to read because the order of the operations is from inside to out. Thus, the arguments are a long way away from the function. To get around this problem, dplyr provides the <code>%>%</code> operator. <code>x %>% f(y)</code> turns into <code>f(x, y)</code> so you can use it to rewrite multiple operations that you can read left-to-right, top-to-bottom:</p>
-<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r">flights %>%
-<span class="st"> </span><span class="kw">group_by</span>(year, month, day) %>%
-<span class="st"> </span><span class="kw">select</span>(arr_delay, dep_delay) %>%
-<span class="st"> </span><span class="kw">summarise</span>(
- <span class="dt">arr =</span> <span class="kw">mean</span>(arr_delay, <span class="dt">na.rm =</span> <span class="ot">TRUE</span>),
- <span class="dt">dep =</span> <span class="kw">mean</span>(dep_delay, <span class="dt">na.rm =</span> <span class="ot">TRUE</span>)
- ) %>%
-<span class="st"> </span><span class="kw">filter</span>(arr ><span class="st"> </span><span class="dv">30</span> |<span class="st"> </span>dep ><span class="st"> </span><span class="dv">30</span>)</code></pre></div>
-</div>
-</div>
-<div id="other-data-sources" class="section level1">
-<h1>Other data sources</h1>
-<p>As well as data frames, dplyr works with data that is stored in other ways, like data tables, databases and multidimensional arrays.</p>
-<div id="data-table" class="section level2">
-<h2>Data table</h2>
-<p>dplyr also provides <a href="http://datatable.r-forge.r-project.org/">data table</a> methods for all verbs through <a href="http://github.com/hadley/dtplyr">dtplyr</a>. If you’re using data.tables already this lets you to use dplyr syntax for data manipulation, and data.table for everything else.</p>
-<p>For multiple operations, data.table can be faster because you usually use it with multiple verbs simultaneously. For example, with data table you can do a mutate and a select in a single step. It’s smart enough to know that there’s no point in computing the new variable for rows you’re about to throw away.</p>
-<p>The advantages of using dplyr with data tables are:</p>
-<ul>
-<li><p>For common data manipulation tasks, it insulates you from the reference semantics of data.tables, and protects you from accidentally modifying your data.</p></li>
-<li><p>Instead of one complex method built on the subscripting operator (<code>[</code>), it provides many simple methods.</p></li>
-</ul>
-</div>
-<div id="databases" class="section level2">
-<h2>Databases</h2>
-<p>dplyr also allows you to use the same verbs with a remote database. It takes care of generating the SQL for you so that you can avoid the cognitive challenge of constantly switching between languages. See the databases vignette for more details.</p>
-<p>Compared to DBI and the database connection algorithms:</p>
-<ul>
-<li>it hides, as much as possible, the fact that you’re working with a remote database</li>
-<li>you don’t need to know any SQL (although it helps!)</li>
-<li>it abstracts over the many differences between the different DBI implementations</li>
-</ul>
-</div>
-<div id="multidimensional-arrays-cubes" class="section level2">
-<h2>Multidimensional arrays / cubes</h2>
-<p><code>tbl_cube()</code> provides an experimental interface to multidimensional arrays or data cubes. If you’re using this form of data in R, please get in touch so I can better understand your needs.</p>
-</div>
-</div>
-<div id="comparisons" class="section level1">
-<h1>Comparisons</h1>
-<p>Compared to all existing options, dplyr:</p>
-<ul>
-<li><p>abstracts away how your data is stored, so that you can work with data frames, data tables and remote databases using the same set of functions. This lets you focus on what you want to achieve, not on the logistics of data storage.</p></li>
-<li><p>provides a thoughtful default <code>print()</code> method that doesn’t automatically print pages of data to the screen (this was inspired by data table’s output).</p></li>
-</ul>
-<p>Compared to base functions:</p>
-<ul>
-<li><p>dplyr is much more consistent; functions have the same interface. So once you’ve mastered one, you can easily pick up the others</p></li>
-<li><p>base functions tend to be based around vectors; dplyr is based around data frames</p></li>
-</ul>
-<p>Compared to plyr, dplyr:</p>
-<ul>
-<li><p>is much much faster</p></li>
-<li><p>provides a better thought out set of joins</p></li>
-<li><p>only provides tools for working with data frames (e.g. most of dplyr is equivalent to <code>ddply()</code> + various functions, <code>do()</code> is equivalent to <code>dlply()</code>)</p></li>
-</ul>
-<p>Compared to virtual data frame approaches:</p>
-<ul>
-<li><p>it doesn’t pretend that you have a data frame: if you want to run lm etc, you’ll still need to manually pull down the data</p></li>
-<li><p>it doesn’t provide methods for R summary functions (e.g. <code>mean()</code>, or <code>sum()</code>)</p></li>
-</ul>
-</div>
-
-
-
-<!-- dynamically load mathjax for compatibility with self-contained -->
-<script>
- (function () {
- var script = document.createElement("script");
- script.type = "text/javascript";
- script.src = "https://cdn.mathjax.org/mathjax/latest/MathJax.js?config=TeX-AMS-MML_HTMLorMML";
- document.getElementsByTagName("head")[0].appendChild(script);
- })();
-</script>
-
-</body>
-</html>
diff --git a/inst/doc/new-sql-backend.R b/inst/doc/new-sql-backend.R
deleted file mode 100644
index 3e0a1f3..0000000
--- a/inst/doc/new-sql-backend.R
+++ /dev/null
@@ -1,34 +0,0 @@
-## ---- echo = FALSE, message = FALSE--------------------------------------
-knitr::opts_chunk$set(collapse = T, comment = "#>")
-options(tibble.print_min = 4L, tibble.print_max = 4L)
-library(dplyr)
-
-## ---- eval = FALSE-------------------------------------------------------
-# src_postgres <- function(dbname = NULL, host = NULL, port = NULL, user = NULL,
-# password = NULL, ...) {
-#
-# con <- dbConnect(PostgreSQL(), host = host %||% "", dbname = dbname %||% "",
-# user = user, password = password %||% "", port = port %||% "", ...)
-#
-# src_sql("postgres", con)
-# }
-
-## ------------------------------------------------------------------------
-#' @export
-src_desc.src_postgres <- function(con) {
- info <- dbGetInfo(con)
- host <- if (info$host == "") "localhost" else info$host
-
- paste0("postgres ", info$serverVersion, " [", info$user, "@",
- host, ":", info$port, "/", info$dbname, "]")
-}
-
-## ---- eval = FALSE-------------------------------------------------------
-# tbl.src_mssql <- function(src, from, ...) {
-# tbl_sql("mssql", src = src, from = from, ...)
-# }
-
-## ---- eval = FALSE-------------------------------------------------------
-# copy_nycflights13(src_mssql(...))
-# copy_lahman(src_mssql(...))
-
diff --git a/inst/doc/new-sql-backend.Rmd b/inst/doc/new-sql-backend.Rmd
deleted file mode 100644
index 8d2f82f..0000000
--- a/inst/doc/new-sql-backend.Rmd
+++ /dev/null
@@ -1,123 +0,0 @@
----
-title: "Adding a new SQL backend"
-date: "`r Sys.Date()`"
-output: rmarkdown::html_vignette
-vignette: >
- %\VignetteIndexEntry{Adding a new SQL backend}
- %\VignetteEngine{knitr::rmarkdown}
- \usepackage[utf8]{inputenc}
----
-
-```{r, echo = FALSE, message = FALSE}
-knitr::opts_chunk$set(collapse = T, comment = "#>")
-options(tibble.print_min = 4L, tibble.print_max = 4L)
-library(dplyr)
-```
-
-This document describes how to add a new SQL backend to dplyr. To begin:
-
-* Ensure that you have a DBI compliant database backend. If not, you'll need
- to first create it by following the instructions in
- `vignette("backend", package = "DBI")`.
-
-* You'll need a working knowledge of S3. Make sure that you're
- [familiar with the basics](http://adv-r.had.co.nz/OO-essentials.html#s3)
- before you start.
-
-This document is still a work in progress, but it will hopefully get you started. If you're familiar with how your database and at least one other database that dplyr supports already, this should be reasonably simple task. However, it is possible that a new database backend may need new methods - I'm happy to add those as needed.
-
-## Create the src object
-
-Start by creating a new src function to represent the backend. Assuming we're going to create a src for postgres, you'd call it `src_postgres()`, and you'd follow the pattern of an existing src. A simplified version of `src_postgres()` is show below:
-
-```{r, eval = FALSE}
-src_postgres <- function(dbname = NULL, host = NULL, port = NULL, user = NULL,
- password = NULL, ...) {
-
- con <- dbConnect(PostgreSQL(), host = host %||% "", dbname = dbname %||% "",
- user = user, password = password %||% "", port = port %||% "", ...)
-
- src_sql("postgres", con)
-}
-```
-
-Use `src_sql()` to create a new S3 object with the correct structure. It must have a DBI connection, but it can store anything else that might be useful.
-
-Next, implement a method for `src_desc()` that briefly describes the source:
-
-```{r}
-#' @export
-src_desc.src_postgres <- function(con) {
- info <- dbGetInfo(con)
- host <- if (info$host == "") "localhost" else info$host
-
- paste0("postgres ", info$serverVersion, " [", info$user, "@",
- host, ":", info$port, "/", info$dbname, "]")
-}
-```
-
-If you read the source code for the real `src_postgres()` you'll notice that it caches the `getGetInfo()` field on creation, since this saves a little time when printing tbls.
-
-Before continuing, check that you can create a connection to a local database, and that you get a listing of the existing tables. If you have a problem at this point, you may need to check the DBI backend.
-
-## tbl
-
-Next implement the `tbl()` method for your data source. This will probably just be:
-
-```{r, eval = FALSE}
-tbl.src_mssql <- function(src, from, ...) {
- tbl_sql("mssql", src = src, from = from, ...)
-}
-```
-
-Before continuing, make sure you can connect to an existing table, and that the results are printed reasonably. If not, that may indicate your database uses a non-standard DBI interface, and you'll need to fix that before continuing.
-
-This is also a good time implement `explain()`, by adding a method for `db_explain()`.
-
-If your database uses non-standard quoting (i.e. something other than `"` for identifiers and `'` for strings), implement methods for `sql_escape_string()` and `sql_escape_ident()`.
-
-You may need to implement `db_query_fields()`, which should return a character vector giving the field names of a query.
-
-At this point, all the basic verbs (`summarise()`, `filter()`, `arrange()`, `mutate()` etc) should also work, but it's hard to test without some data.
-
-## `copy_to()`
-
-Next, implement the methods that power `copy_to()` work. Once you've implemented these methods, you'll be able copy datasets from R into your database, which will make testing much easier.
-
-* `db_data_type()`
-* `sql_begin()`, `sql_commit()`, `sql_rollback()`
-* `sql_create_table()`, `sql_insert_into()`, `sql_drop_table()`
-* `sql_create_index()`, `sql_analyze()`
-
-If the database doesn't support a function, just return `TRUE` without doing anything. If you find these methods a very poor match to your backend, you may find it easier to provide a direct `copy_to()` method.
-
-At this point, you should be able to copy the nycflights13 data packages into your database with (e.g.):
-
-```{r, eval = FALSE}
-copy_nycflights13(src_mssql(...))
-copy_lahman(src_mssql(...))
-```
-
-Don't proceed further until this works, and you've verified that the basic single table verbs word.
-
-## Compute, collect and collapse
-
-Next, check that `collapse()`, `compute()`, and `collect()` work.
-
-* If `collapse()` fails, your database has a non-standard way of constructing
- subqueries. Add a method for `sql_subquery()`.
-
-* If `compute()` fails, your database has a non-standard way of saving queries
- in temporary tables. Add a method for `db_save_query()`.
-
-## Multi table verbs
-
-Next check the multitable verbs:
-
-* `left_join()`, `inner_join()`: powered by `sql_join()`
-* `semi_join()`, `anti_join()`: powered by `sql_semi_join()`
-* `union()`, `intersect()`, `setdiff()`: powered by `sql_set_op()`
-
-## sql translation
-
-To finish off, you can add custom R -> SQL translation by providing a method for `src_translate_env()`. This function should return an object created by `sql_variant()`. See existing methods for examples.
diff --git a/inst/doc/nse.R b/inst/doc/nse.R
deleted file mode 100644
index e2f2d3d..0000000
--- a/inst/doc/nse.R
+++ /dev/null
@@ -1,51 +0,0 @@
-## ---- echo = FALSE, message = FALSE--------------------------------------
-knitr::opts_chunk$set(collapse = T, comment = "#>")
-options(tibble.print_min = 4L, tibble.print_max = 4L)
-library(dplyr)
-
-## ------------------------------------------------------------------------
-# NSE version:
-summarise(mtcars, mean(mpg))
-
-# SE versions:
-summarise_(mtcars, ~mean(mpg))
-summarise_(mtcars, quote(mean(mpg)))
-summarise_(mtcars, "mean(mpg)")
-
-## ------------------------------------------------------------------------
-constant1 <- function(n) ~n
-summarise_(mtcars, constant1(4))
-
-## ------------------------------------------------------------------------
-n <- 10
-dots <- list(~mean(mpg), ~n)
-summarise_(mtcars, .dots = dots)
-
-summarise_(mtcars, .dots = setNames(dots, c("mean", "count")))
-
-## ------------------------------------------------------------------------
-library(lazyeval)
-# Interp works with formulas, quoted calls and strings (but formulas are best)
-interp(~ x + y, x = 10)
-interp(quote(x + y), x = 10)
-interp("x + y", x = 10)
-
-# Use as.name if you have a character string that gives a variable name
-interp(~ mean(var), var = as.name("mpg"))
-# or supply the quoted name directly
-interp(~ mean(var), var = quote(mpg))
-
-## ------------------------------------------------------------------------
-interp(~ f(a, b), f = quote(mean))
-interp(~ f(a, b), f = as.name("+"))
-interp(~ f(a, b), f = quote(`if`))
-
-## ------------------------------------------------------------------------
-interp(~ x + y, .values = list(x = 10))
-
-# You can also interpolate variables defined in the current
-# environment, but this is a little risky becuase it's easy
-# for this to change without you realising
-y <- 10
-interp(~ x + y, .values = environment())
-
diff --git a/inst/doc/nse.Rmd b/inst/doc/nse.Rmd
deleted file mode 100644
index 44d0804..0000000
--- a/inst/doc/nse.Rmd
+++ /dev/null
@@ -1,102 +0,0 @@
----
-title: "Non-standard evaluation"
-date: "`r Sys.Date()`"
-output: rmarkdown::html_vignette
-vignette: >
- %\VignetteIndexEntry{Non-standard evaluation}
- %\VignetteEngine{knitr::rmarkdown}
- %\usepackage[utf8]{inputenc}
----
-
-```{r, echo = FALSE, message = FALSE}
-knitr::opts_chunk$set(collapse = T, comment = "#>")
-options(tibble.print_min = 4L, tibble.print_max = 4L)
-library(dplyr)
-```
-
-Dplyr uses non-standard evaluation (NSE) in all the important single table verbs: `filter()`, `mutate()`, `summarise()`, `arrange()`, `select()` and `group_by()`. NSE is important not only because it reduces typing; for database backends, it's what makes it possible to translate R code into SQL. However, while NSE is great for interactive use it's hard to program with. This vignette describes how you can opt out of NSE in dplyr, and instead (with a little quoting) rely only on standard e [...]
-
-Behind the scenes, NSE is powered by the [lazyeval](https://github.com/hadley/lazyeval) package. The goal is to provide an approach to NSE that you can learn once and then apply in many places (dplyr is the first of my packages to use this approach, but over time I will implement it everywhere). You may want to read the lazyeval vignettes, if you'd like to learn more about the underlying details, or if you'd like to use this approach in your own packages.
-
-## Standard evaluation basics
-
-Every function in dplyr that uses NSE also has a version that uses SE. The name of the SE version is always the NSE name with an `_` on the end. For example, the SE version of `summarise()` is `summarise_()`; the SE version of `arrange()` is `arrange_()`. These functions work very similarly to their NSE cousins, but their inputs must be "quoted":
-
-```{r}
-# NSE version:
-summarise(mtcars, mean(mpg))
-
-# SE versions:
-summarise_(mtcars, ~mean(mpg))
-summarise_(mtcars, quote(mean(mpg)))
-summarise_(mtcars, "mean(mpg)")
-```
-
-There are three ways to quote inputs that dplyr understands:
-
-* With a formula, `~ mean(mpg)`.
-* With `quote()`, `quote(mean(mpg))`.
-* As a string: `"mean(mpg)"`.
-
-It's best to use a formula because a formula captures both the expression to evaluate and the environment where the evaluation occurs. This is important if the expression is a mixture of variables in a data frame and objects in the local environment:
-
-```{r}
-constant1 <- function(n) ~n
-summarise_(mtcars, constant1(4))
-```
-
-```{r, error = TRUE, purl = FALSE}
-# Using anything other than a formula will fail because it doesn't
-# know which environment to look in
-constant2 <- function(n) quote(n)
-summarise_(mtcars, constant2(4))
-```
-
-## Setting variable names
-
-If you also want output variables to vary, you need to pass a list of quoted objects to the `.dots` argument:
-
-```{r}
-n <- 10
-dots <- list(~mean(mpg), ~n)
-summarise_(mtcars, .dots = dots)
-
-summarise_(mtcars, .dots = setNames(dots, c("mean", "count")))
-```
-
-## Mixing constants and variables
-
-What if you need to mingle constants and variables? Use the handy `lazyeval::interp()`:
-
-```{r}
-library(lazyeval)
-# Interp works with formulas, quoted calls and strings (but formulas are best)
-interp(~ x + y, x = 10)
-interp(quote(x + y), x = 10)
-interp("x + y", x = 10)
-
-# Use as.name if you have a character string that gives a variable name
-interp(~ mean(var), var = as.name("mpg"))
-# or supply the quoted name directly
-interp(~ mean(var), var = quote(mpg))
-```
-
-Because [every action in R is a function call](http://adv-r.had.co.nz/Functions.html#all-calls) you can use this same idea to modify functions:
-
-```{r}
-interp(~ f(a, b), f = quote(mean))
-interp(~ f(a, b), f = as.name("+"))
-interp(~ f(a, b), f = quote(`if`))
-```
-
-If you already have a list of values, use `.values`:
-
-```{r}
-interp(~ x + y, .values = list(x = 10))
-
-# You can also interpolate variables defined in the current
-# environment, but this is a little risky becuase it's easy
-# for this to change without you realising
-y <- 10
-interp(~ x + y, .values = environment())
-```
diff --git a/inst/doc/programming.R b/inst/doc/programming.R
new file mode 100644
index 0000000..134d93b
--- /dev/null
+++ b/inst/doc/programming.R
@@ -0,0 +1,314 @@
+## ----setup, echo = FALSE, message = FALSE--------------------------------
+knitr::opts_chunk$set(collapse = T, comment = "#>")
+options(tibble.print_min = 4L, tibble.print_max = 4L)
+library(dplyr)
+set.seed(1014)
+
+## ------------------------------------------------------------------------
+df <- tibble(x = 1:3, y = 3:1)
+filter(df, x == 1)
+
+## ---- error = TRUE-------------------------------------------------------
+my_var <- x
+filter(df, my_var == 1)
+
+## ---- error = TRUE-------------------------------------------------------
+my_var <- "x"
+filter(df, my_var == 1)
+
+## ---- eval = FALSE-------------------------------------------------------
+# df[df$x == df$y, ]
+# df[df$x == y, ]
+# df[x == df$y, ]
+# df[x == y, ]
+
+## ------------------------------------------------------------------------
+greet <- function(name) {
+ "How do you do, name?"
+}
+greet("Hadley")
+
+## ------------------------------------------------------------------------
+greet <- function(name) {
+ paste0("How do you do, ", name, "?")
+}
+greet("Hadley")
+
+## ------------------------------------------------------------------------
+greet <- function(name) {
+ glue::glue("How do you do, {name}?")
+}
+greet("Hadley")
+
+## ---- eval = FALSE-------------------------------------------------------
+# mutate(df1, y = a + x)
+# mutate(df2, y = a + x)
+# mutate(df3, y = a + x)
+# mutate(df4, y = a + x)
+
+## ------------------------------------------------------------------------
+mutate_y <- function(df) {
+ mutate(df, y = a + x)
+}
+
+## ------------------------------------------------------------------------
+df1 <- tibble(x = 1:3)
+a <- 10
+mutate_y(df1)
+
+## ---- error = TRUE-------------------------------------------------------
+mutate_y <- function(df) {
+ mutate(df, y = .data$a + .data$x)
+}
+
+mutate_y(df1)
+
+## ------------------------------------------------------------------------
+df <- tibble(
+ g1 = c(1, 1, 2, 2, 2),
+ g2 = c(1, 2, 1, 2, 1),
+ a = sample(5),
+ b = sample(5)
+)
+
+df %>%
+ group_by(g1) %>%
+ summarise(a = mean(a))
+
+df %>%
+ group_by(g2) %>%
+ summarise(a = mean(a))
+
+## ---- error = TRUE-------------------------------------------------------
+my_summarise <- function(df, group_var) {
+ df %>%
+ group_by(group_var) %>%
+ summarise(a = mean(a))
+}
+
+my_summarise(df, g1)
+
+## ---- error = TRUE-------------------------------------------------------
+my_summarise(df, "g2")
+
+## ------------------------------------------------------------------------
+quo(g1)
+quo(a + b + c)
+quo("a")
+
+## ---- error = TRUE-------------------------------------------------------
+my_summarise(df, quo(g1))
+
+## ------------------------------------------------------------------------
+my_summarise <- function(df, group_var) {
+ df %>%
+ group_by(!!group_var) %>%
+ summarise(a = mean(a))
+}
+
+my_summarise(df, quo(g1))
+
+## ---- eval = FALSE-------------------------------------------------------
+# my_summarise(df, g1)
+
+## ---- error = TRUE-------------------------------------------------------
+my_summarise <- function(df, group_var) {
+ quo_group_var <- quo(group_var)
+ print(quo_group_var)
+
+ df %>%
+ group_by(!!quo_group_var) %>%
+ summarise(a = mean(a))
+}
+
+my_summarise(df, g1)
+
+## ------------------------------------------------------------------------
+my_summarise <- function(df, group_var) {
+ group_var <- enquo(group_var)
+ print(group_var)
+
+ df %>%
+ group_by(!!group_var) %>%
+ summarise(a = mean(a))
+}
+
+my_summarise(df, g1)
+
+## ------------------------------------------------------------------------
+summarise(df, mean = mean(a), sum = sum(a), n = n())
+summarise(df, mean = mean(a * b), sum = sum(a * b), n = n())
+
+## ------------------------------------------------------------------------
+my_var <- quo(a)
+summarise(df, mean = mean(!!my_var), sum = sum(!!my_var), n = n())
+
+## ------------------------------------------------------------------------
+quo(summarise(df,
+ mean = mean(!!my_var),
+ sum = sum(!!my_var),
+ n = n()
+))
+
+## ------------------------------------------------------------------------
+my_summarise2 <- function(df, expr) {
+ expr <- enquo(expr)
+
+ summarise(df,
+ mean = mean(!!expr),
+ sum = sum(!!expr),
+ n = n()
+ )
+}
+my_summarise2(df, a)
+my_summarise2(df, a * b)
+
+## ------------------------------------------------------------------------
+mutate(df, mean_a = mean(a), sum_a = sum(a))
+mutate(df, mean_b = mean(b), sum_b = sum(b))
+
+## ------------------------------------------------------------------------
+my_mutate <- function(df, expr) {
+ expr <- enquo(expr)
+ mean_name <- paste0("mean_", quo_name(expr))
+ sum_name <- paste0("sum_", quo_name(expr))
+
+ mutate(df,
+ !!mean_name := mean(!!expr),
+ !!sum_name := sum(!!expr)
+ )
+}
+
+my_mutate(df, a)
+
+## ------------------------------------------------------------------------
+my_summarise <- function(df, ...) {
+ group_var <- quos(...)
+
+ df %>%
+ group_by(!!!group_var) %>%
+ summarise(a = mean(a))
+}
+
+my_summarise(df, g1, g2)
+
+## ------------------------------------------------------------------------
+args <- list(na.rm = TRUE, trim = 0.25)
+quo(mean(x, !!! args))
+
+args <- list(quo(x), na.rm = TRUE, trim = 0.25)
+quo(mean(!!! args))
+
+## ------------------------------------------------------------------------
+disp ~ cyl + drat
+
+## ------------------------------------------------------------------------
+# Computing the value of the expression:
+toupper(letters[1:5])
+
+# Capturing the expression:
+quote(toupper(letters[1:5]))
+
+## ------------------------------------------------------------------------
+f <- function(x) {
+ quo(x)
+}
+
+x1 <- f(10)
+x2 <- f(100)
+
+## ------------------------------------------------------------------------
+x1
+x2
+
+## ---- message = FALSE----------------------------------------------------
+library(rlang)
+
+get_env(x1)
+get_env(x2)
+
+## ------------------------------------------------------------------------
+eval_tidy(x1)
+eval_tidy(x2)
+
+## ------------------------------------------------------------------------
+user_var <- 1000
+mtcars %>% summarise(cyl = mean(cyl) * user_var)
+
+## ------------------------------------------------------------------------
+typeof(mean)
+
+## ------------------------------------------------------------------------
+var <- ~toupper(letters[1:5])
+var
+
+# You can extract its expression:
+get_expr(var)
+
+# Or inspect its enclosure:
+get_env(var)
+
+## ------------------------------------------------------------------------
+# Here we capture `letters[1:5]` as an expression:
+quo(toupper(letters[1:5]))
+
+# Here we capture the value of `letters[1:5]`
+quo(toupper(!!letters[1:5]))
+quo(toupper(UQ(letters[1:5])))
+
+## ------------------------------------------------------------------------
+var1 <- quo(letters[1:5])
+quo(toupper(!!var1))
+
+## ------------------------------------------------------------------------
+my_mutate <- function(x) {
+ mtcars %>%
+ select(cyl) %>%
+ slice(1:4) %>%
+ mutate(cyl2 = cyl + (!! x))
+}
+
+f <- function(x) quo(x)
+expr1 <- f(100)
+expr2 <- f(10)
+
+my_mutate(expr1)
+my_mutate(expr2)
+
+## ---- error = TRUE-------------------------------------------------------
+my_fun <- quo(fun)
+quo(!!my_fun(x, y, z))
+quo(UQ(my_fun)(x, y, z))
+
+my_var <- quo(x)
+quo(filter(df, !!my_var == 1))
+quo(filter(df, UQ(my_var) == 1))
+
+## ------------------------------------------------------------------------
+quo(UQE(my_fun)(x, y, z))
+quo(filter(df, UQE(my_var) == 1))
+
+## ------------------------------------------------------------------------
+quo(list(!!! letters[1:5]))
+
+## ------------------------------------------------------------------------
+x <- list(foo = 1L, bar = quo(baz))
+quo(list(!!! x))
+
+## ------------------------------------------------------------------------
+args <- list(mean = quo(mean(cyl)), count = quo(n()))
+mtcars %>%
+ group_by(am) %>%
+ summarise(!!! args)
+
+## ------------------------------------------------------------------------
+mean_nm <- "mean"
+count_nm <- "count"
+
+mtcars %>%
+ group_by(am) %>%
+ summarise(
+ !!mean_nm := mean(cyl),
+ !!count_nm := n()
+ )
+
diff --git a/inst/doc/programming.Rmd b/inst/doc/programming.Rmd
new file mode 100644
index 0000000..43eadb4
--- /dev/null
+++ b/inst/doc/programming.Rmd
@@ -0,0 +1,587 @@
+---
+title: "Programming with dplyr"
+output: rmarkdown::html_vignette
+vignette: >
+ %\VignetteIndexEntry{Programming with dplyr}
+ %\VignetteEngine{knitr::rmarkdown}
+ %\usepackage[utf8]{inputenc}
+---
+
+```{r setup, echo = FALSE, message = FALSE}
+knitr::opts_chunk$set(collapse = T, comment = "#>")
+options(tibble.print_min = 4L, tibble.print_max = 4L)
+library(dplyr)
+set.seed(1014)
+```
+
+Most dplyr functions use non-standard evaluation (NSE). This is a catch-all term that means they don't follow the usual R rules of evaluation. Instead, they capture the expression that you typed and evaluate it in a custom way. This has two main benefits for dplyr code:
+
+* Operations on data frames can be expressed succinctly because
+ you don't need to repeat the name of the data frame. For example,
+ you can write `filter(df, x == 1, y == 2, z == 3)` instead of
+ `df[df$x == 1 & df$y == 2 & df$z == 3, ]`.
+
+* dplyr can choose to compute results in a different way to base R.
+ This is important for database backends because dplyr itself doesn't
+ do any work, but instead generates the SQL that tells the database
+ what to do.
+
+Unfortunately these benefits do not come for free. There are two main drawbacks:
+
+* Most dplyr arguments are not __referentially transparent__. That means
+ you can't replace a value with a seemingly equivalent object that you've
+ defined elsewhere. In other words, this code:
+
+ ```{r}
+ df <- tibble(x = 1:3, y = 3:1)
+ filter(df, x == 1)
+ ```
+
+ Is not equivalent to this code:
+
+ ```{r, error = TRUE}
+ my_var <- x
+ filter(df, my_var == 1)
+ ```
+
+ nor to this code:
+
+ ```{r, error = TRUE}
+ my_var <- "x"
+ filter(df, my_var == 1)
+ ```
+
+ This makes it hard to create functions with arguments that change how
+ dplyr verbs are computed.
+
+* dplyr code is ambiguous. Depending on what variables are defined where,
+ `filter(df, x == y)` could be equivalent to any of:
+
+ ```{r, eval = FALSE}
+ df[df$x == df$y, ]
+ df[df$x == y, ]
+ df[x == df$y, ]
+ df[x == y, ]
+ ```
+
+ This is useful when working interactively (because it saves typing and you
+ quickly spot problems) but makes functions more unpredictable than you
+ might desire.
+
+Fortunately, dplyr provides tools to overcome these challenges. They require a little more typing, but a small amount of upfront work is worth it because they help you save time in the long run.
+
+This vignette has two goals:
+
+* Show you how to use dplyr's __pronouns__ and __quasiquotation__
+ to write reliable functions that reduce duplication in your data analysis
+ code.
+
+* To teach you the underlying theory including __quosures__, the data
+ structure that stores both an expression and an environment, and
+ __tidyeval__, the underlying toolkit.
+
+We'll start with a warmup, tying this problem to something you're more familiar with, then move on to some practical tools, then dive into the deeper theory.
+
+## Warm up
+
+You might not have realised it, but you're already accomplished at solving this type of problem in another domain: strings. It's obvious that this function doesn't do what you want:
+
+```{r}
+greet <- function(name) {
+ "How do you do, name?"
+}
+greet("Hadley")
+```
+
+That's because `"` "quotes" its input: it doesn't interpret what you've typed, it just stores it in a string. One way to make the function do what you want is to use `paste()` to build up the string piece by piece:
+
+```{r}
+greet <- function(name) {
+ paste0("How do you do, ", name, "?")
+}
+greet("Hadley")
+```
+
+Another approach is exemplified by the __glue__ package: it allows you to "unquote" components of a string, replacing the string with the value of the R expression. This allows an elegant implementation of our function because `{name}` is replaced with the value of the `name` argument.
+
+```{r}
+greet <- function(name) {
+ glue::glue("How do you do, {name}?")
+}
+greet("Hadley")
+```
+
+## Programming recipes
+
+The following recipes walk you through the basics of tidyeval, with the nominal goal of reducing duplication in dplyr code. The examples here are somewhat inauthentic because we've reduced them down to very simple components to make them easier to understand. They're so simple that you might wonder why we bother writing a function at all. But it's a good idea to learn the ideas on simple examples, so that you're better prepared to apply them to the more complex situations you'll see in y [...]
+
+### Different data sets
+
+You already know how to write functions that work with the first argument of dplyr verbs: the data. That's because dplyr doesn't do anything special with that argument, so it's referentially transparent. For example, if you saw repeated code like this:
+
+```{r, eval = FALSE}
+mutate(df1, y = a + x)
+mutate(df2, y = a + x)
+mutate(df3, y = a + x)
+mutate(df4, y = a + x)
+```
+
+You could already write a function to capture that duplication:
+
+```{r}
+mutate_y <- function(df) {
+ mutate(df, y = a + x)
+}
+```
+
+Unfortunately, there's a drawback to this simple approach: it can fail silently if one of the variables isn't present in the data frame, but is present in the global environment.
+
+```{r}
+df1 <- tibble(x = 1:3)
+a <- 10
+mutate_y(df1)
+```
+
+We can fix that ambiguity by being more explicit and using the `.data` pronoun. This will throw an informative error if the variable doesn't exist:
+
+```{r, error = TRUE}
+mutate_y <- function(df) {
+ mutate(df, y = .data$a + .data$x)
+}
+
+mutate_y(df1)
+```
+
+If this function is in a package, using `.data` also prevents `R CMD check` from giving a NOTE about undefined global variables (provided that you've also imported `rlang::.data` with `@importFrom rlang .data`).
+
+### Different expressions
+
+Writing a function is hard if you want one of the arguments to be a variable name (like `x`) or an expression (like `x + y`). That's because dplyr automatically "quotes" those inputs, so they are not referentially transparent. Let's start with a simple case: you want to vary the grouping variable for a data summarization.
+
+```{r}
+df <- tibble(
+ g1 = c(1, 1, 2, 2, 2),
+ g2 = c(1, 2, 1, 2, 1),
+ a = sample(5),
+ b = sample(5)
+)
+
+df %>%
+ group_by(g1) %>%
+ summarise(a = mean(a))
+
+df %>%
+ group_by(g2) %>%
+ summarise(a = mean(a))
+```
+
+You might hope that this will work:
+
+```{r, error = TRUE}
+my_summarise <- function(df, group_var) {
+ df %>%
+ group_by(group_var) %>%
+ summarise(a = mean(a))
+}
+
+my_summarise(df, g1)
+```
+
+But it doesn't.
+
+Maybe providing the variable name as a string will fix things?
+
+```{r, error = TRUE}
+my_summarise(df, "g2")
+```
+
+Nope.
+
+If you look carefully at the error message, you'll see that it's the same in both cases. `group_by()` works like `"`: it doesn't evaluate its input; it quotes it.
+
+To make this function work, we need to do two things. We need to quote the input ourselves (so `my_summarise()` can take a bare variable name like `group_by()`), and then we need to tell `group_by()` not to quote its input (because we've done the quoting).
+
+How do we quote the input? We can't use `""` to quote the input, because that gives us a string. Instead we need a function that captures the expression and its environment (we'll come back to why this is important later on). There are two possible options we could use in base R, the function `quote()` and the operator `~`. Neither of these work quite the way we want, so we need a new function: `quo()`.
+
+`quo()` works like `"`: it quotes its input rather than evaluating it.
+
+```{r}
+quo(g1)
+quo(a + b + c)
+quo("a")
+```
+
+`quo()` returns a __quosure__, which is a special type of formula. You'll learn more about quosures later on.
+
+Now that we've captured this expression, how do we use it with `group_by()`? It doesn't work if we just shove it into our naive approach:
+
+```{r, error = TRUE}
+my_summarise(df, quo(g1))
+```
+
+We get the same error as before, because we haven't yet told `group_by()` that we're taking care of the quoting. In other words, we need to tell `group_by()` not to quote its input, because it has been pre-quoted by `my_summarise()`. Yet another way of saying the same thing is that we want to __unquote__ `group_var`.
+
+In dplyr (and in tidyeval in general) you use `!!` to say that you want to unquote an input so that it's evaluated, not quoted. This gives us a function that actually does what we want.
+
+```{r}
+my_summarise <- function(df, group_var) {
+ df %>%
+ group_by(!!group_var) %>%
+ summarise(a = mean(a))
+}
+
+my_summarise(df, quo(g1))
+```
+
+Huzzah!
+
+There's just one step left: we want to call this function like we call `group_by()`:
+
+```{r, eval = FALSE}
+my_summarise(df, g1)
+```
+
+This doesn't work because there's no object called `g1`. We need to capture what the user of the function typed and quote it for them. You might try using `quo()` to do that:
+
+```{r, error = TRUE}
+my_summarise <- function(df, group_var) {
+ quo_group_var <- quo(group_var)
+ print(quo_group_var)
+
+ df %>%
+ group_by(!!quo_group_var) %>%
+ summarise(a = mean(a))
+}
+
+my_summarise(df, g1)
+```
+
+I've added a `print()` call to make it obvious what's going wrong here: `quo(group_var)` always returns `~group_var`. It is being too literal! We want it to substitute the value that the user supplied, i.e. to return `~g1`.
+
+By analogy to strings, we don't want `""`; instead we want some function that turns an argument into a string. That's the job of `enquo()`. `enquo()` uses some dark magic to look at the argument, see what the user typed, and return that value as a quosure. (Technically, this works because function arguments are evaluated lazily, using a special data structure called a __promise__.)
+
+```{r}
+my_summarise <- function(df, group_var) {
+ group_var <- enquo(group_var)
+ print(group_var)
+
+ df %>%
+ group_by(!!group_var) %>%
+ summarise(a = mean(a))
+}
+
+my_summarise(df, g1)
+```
+
+(If you're familiar with `quote()` and `substitute()` in base R, `quo()` is equivalent to `quote()` and `enquo()` is equivalent to `substitute()`.)
+
+You might wonder how to extend this to handle multiple grouping variables: we'll come back to that a little later.
+
+### Different input variable
+
+Now let's tackle something a bit more complicated. The code below shows a duplicate `summarise()` statement where we compute three summaries, varying the input variable.
+
+```{r}
+summarise(df, mean = mean(a), sum = sum(a), n = n())
+summarise(df, mean = mean(a * b), sum = sum(a * b), n = n())
+```
+
+To turn this into a function, we start by testing the basic approach interactively: we quote the variable with `quo()`, then unquoting it in the dplyr call with `!!`. Notice that we can unquote anywhere inside a complicated expression.
+
+```{r}
+my_var <- quo(a)
+summarise(df, mean = mean(!!my_var), sum = sum(!!my_var), n = n())
+```
+
+You can also wrap `quo()` around the dplyr call to see what will happen from dplyr's perspective. This is a very useful tool for debugging.
+
+```{r}
+quo(summarise(df,
+ mean = mean(!!my_var),
+ sum = sum(!!my_var),
+ n = n()
+))
+```
+
+Now we can turn our code into a function (remembering to replace `quo()` with `enquo()`), and check that it works:
+
+```{r}
+my_summarise2 <- function(df, expr) {
+ expr <- enquo(expr)
+
+ summarise(df,
+ mean = mean(!!expr),
+ sum = sum(!!expr),
+ n = n()
+ )
+}
+my_summarise2(df, a)
+my_summarise2(df, a * b)
+```
+
+### Different input and output variable
+
+The next challenge is to vary the name of the output variables:
+
+```{r}
+mutate(df, mean_a = mean(a), sum_a = sum(a))
+mutate(df, mean_b = mean(b), sum_b = sum(b))
+```
+
+This code is similar to the previous example, but there are two new wrinkles:
+
+* We create the new names by pasting together strings, so
+ we need `quo_name()` to convert the input expression to a string.
+
+* `!!mean_name = mean(!!expr)` isn't valid R code, so we need to
+ use the `:=` helper provided by rlang.
+
+```{r}
+my_mutate <- function(df, expr) {
+ expr <- enquo(expr)
+ mean_name <- paste0("mean_", quo_name(expr))
+ sum_name <- paste0("sum_", quo_name(expr))
+
+ mutate(df,
+ !!mean_name := mean(!!expr),
+ !!sum_name := sum(!!expr)
+ )
+}
+
+my_mutate(df, a)
+```
+
+### Capturing multiple variables
+
+It would be nice to extend `my_summarise()` to accept any number of grouping variables. We need to make three changes:
+
+* Use `...` in the function definition so our function can accept any number
+ of arguments.
+
+* Use `quos()` to capture all the `...` as a list of formulas.
+
+* Use `!!!` instead of `!!` to __splice__ the arguments into `group_by()`.
+
+```{r}
+my_summarise <- function(df, ...) {
+ group_var <- quos(...)
+
+ df %>%
+ group_by(!!!group_var) %>%
+ summarise(a = mean(a))
+}
+
+my_summarise(df, g1, g2)
+```
+
+`!!!` takes a list of elements and splices them into the current call. Look at the bottom of the `!!!` and think `...`.
+
+```{r}
+args <- list(na.rm = TRUE, trim = 0.25)
+quo(mean(x, !!! args))
+
+args <- list(quo(x), na.rm = TRUE, trim = 0.25)
+quo(mean(!!! args))
+```
+
+Now that you've learned the basics of tidyeval through some practical examples, we'll dive into the theory. This will help you generalise what you've learned here to new situations.
+
+## Quoting
+
+Quoting is the action of capturing an expression instead of evaluating it. All expression-based functions quote their arguments and get the R code as an expression rather than the result of evaluating that code. If you are an R user, you probably quote expressions on a regular basis. One of the most important quoting operators in R is the _formula_. It is famously used for the specification of statistical models:
+
+```{r}
+disp ~ cyl + drat
+```
+
+The other quoting operator in base R is `quote()`. It returns a raw
+expression rather than a formula:
+
+```{r}
+# Computing the value of the expression:
+toupper(letters[1:5])
+
+# Capturing the expression:
+quote(toupper(letters[1:5]))
+```
+
+(Note that despite being called the double quote, `"` is not a quoting operator in this context, because it generates a string, not an expression.)
+
+In practice, the formula is the better of the two options because it captures the code and its execution __environment__. This is important because even a simple expression can yield different values in different environments. For example, the `x` in the following two expressions refers to different values:
+
+```{r}
+f <- function(x) {
+ quo(x)
+}
+
+x1 <- f(10)
+x2 <- f(100)
+```
+
+It might look like the expressions are the same if you print them out.
+
+```{r}
+x1
+x2
+```
+
+But if you inspect the environments using `rlang::get_env()` --- they're different.
+```{r, message = FALSE}
+library(rlang)
+
+get_env(x1)
+get_env(x2)
+```
+
+Further, when we evaluate those formulas using `rlang::eval_tidy()`, we see that they yield different values:
+
+```{r}
+eval_tidy(x1)
+eval_tidy(x2)
+```
+
+This is a key property of R: one name can refer to different values in different environments. This is also important for dplyr, because it allows you to combine variables and objects in a call:
+
+```{r}
+user_var <- 1000
+mtcars %>% summarise(cyl = mean(cyl) * user_var)
+```
+
+When an object keeps track of an environment, it is said to have an enclosure. This is the reason that functions in R are sometimes referred to as closures:
+
+```{r}
+typeof(mean)
+```
+
+For this reason we use a special name to refer to one-sided formulas: __quosures__. One-sided formulas are quotes (they carry an expression) with an environment.
+
+Quosures are regular R objects. They can be stored in a variable and inspected:
+
+```{r}
+var <- ~toupper(letters[1:5])
+var
+
+# You can extract its expression:
+get_expr(var)
+
+# Or inspect its enclosure:
+get_env(var)
+```
+
+## Quasiquotation
+
+> Put simply, quasi-quotation enables one to introduce symbols that stand for
+> a linguistic expression in a given instance and are used as that linguistic
+> expression in a different instance.
+--- [Willard van Orman Quine](https://en.wikipedia.org/wiki/Quasi-quotation)
+
+Automatic quoting makes dplyr very convenient for interactive use. But if you want to program with dplyr, you need some way to refer to variables indirectly. The solution to this problem is __quasiquotation__, which allows you to evaluate directly inside an expression that is otherwise quoted.
+
+Quasiquotation was coined by Willard van Orman Quine in the 1940s, and was adopted for programming by the LISP community in the 1970s. All expression-based functions in the tidyeval framework support quasiquotation. Unquoting cancels quotation of parts of an expression. There are three types of unquoting:
+
+* basic
+* unquote splicing
+* unquoting names
+
+### Unquoting
+
+The first important operation is the basic unquote, which comes in a functional form, `UQ()`, and as syntactic-sugar, `!!`.
+
+```{r}
+# Here we capture `letters[1:5]` as an expression:
+quo(toupper(letters[1:5]))
+
+# Here we capture the value of `letters[1:5]`
+quo(toupper(!!letters[1:5]))
+quo(toupper(UQ(letters[1:5])))
+```
+
+It is also possible to unquote other quoted expressions. Unquoting such
+symbolic objects provides a powerful way of manipulating expressions.
+
+```{r}
+var1 <- quo(letters[1:5])
+quo(toupper(!!var1))
+```
+
+You can safely unquote quosures because they track their environments, and tidyeval functions know how to evaluate them. This allows any depth of quoting and unquoting.
+
+```{r}
+my_mutate <- function(x) {
+ mtcars %>%
+ select(cyl) %>%
+ slice(1:4) %>%
+ mutate(cyl2 = cyl + (!! x))
+}
+
+f <- function(x) quo(x)
+expr1 <- f(100)
+expr2 <- f(10)
+
+my_mutate(expr1)
+my_mutate(expr2)
+```
+
+The functional form is useful in cases where the precedence of `!` causes problems:
+
+```{r, error = TRUE}
+my_fun <- quo(fun)
+quo(!!my_fun(x, y, z))
+quo(UQ(my_fun)(x, y, z))
+
+my_var <- quo(x)
+quo(filter(df, !!my_var == 1))
+quo(filter(df, UQ(my_var) == 1))
+```
+
+You'll note above that `UQ()` yields a quosure containing a formula. That ensures that when the quosure is evaluated, it'll be looked up in the right environment. In certain code-generation scenarios you just want to use expression and ignore the environment. That's the job of `UQE()`:
+
+```{r}
+quo(UQE(my_fun)(x, y, z))
+quo(filter(df, UQE(my_var) == 1))
+```
+
+`UQE()` is for expert use only as you'll have to carefully analyse the environments to ensure that the generated code is correct.
+
+### Unquote-splicing
+
+The second unquote operation is unquote-splicing. Its functional form is `UQS()` and the syntactic shortcut is `!!!`. It takes a vector and inserts each element of the vector in the surrounding function call:
+
+```{r}
+quo(list(!!! letters[1:5]))
+```
+
+A very useful feature of unquote-splicing is that the vector names
+become argument names:
+
+```{r}
+x <- list(foo = 1L, bar = quo(baz))
+quo(list(!!! x))
+```
+
+This makes it easy to program with dplyr verbs that take named dots:
+
+```{r}
+args <- list(mean = quo(mean(cyl)), count = quo(n()))
+mtcars %>%
+ group_by(am) %>%
+ summarise(!!! args)
+```
+
+### Setting variable names
+
+The final unquote operation is setting argument names. You've seen one way to do that above, but you can also use the definition operator `:=` instead of `=`. `:=` supports unquoting on both the LHS and the RHS.
+
+The rules on the LHS are slightly different: the unquoted operand should evaluate to a string or a symbol.
+
+```{r}
+mean_nm <- "mean"
+count_nm <- "count"
+
+mtcars %>%
+ group_by(am) %>%
+ summarise(
+ !!mean_nm := mean(cyl),
+ !!count_nm := n()
+ )
+```
diff --git a/inst/doc/programming.html b/inst/doc/programming.html
new file mode 100644
index 0000000..e77ba4d
--- /dev/null
+++ b/inst/doc/programming.html
@@ -0,0 +1,613 @@
+<!DOCTYPE html>
+
+<html xmlns="http://www.w3.org/1999/xhtml">
+
+<head>
+
+<meta charset="utf-8" />
+<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
+<meta name="generator" content="pandoc" />
+
+<meta name="viewport" content="width=device-width, initial-scale=1">
+
+
+
+<title>Programming with dplyr</title>
+
+
+
+<style type="text/css">code{white-space: pre;}</style>
+<style type="text/css">
+div.sourceCode { overflow-x: auto; }
+table.sourceCode, tr.sourceCode, td.lineNumbers, td.sourceCode {
+ margin: 0; padding: 0; vertical-align: baseline; border: none; }
+table.sourceCode { width: 100%; line-height: 100%; }
+td.lineNumbers { text-align: right; padding-right: 4px; padding-left: 4px; color: #aaaaaa; border-right: 1px solid #aaaaaa; }
+td.sourceCode { padding-left: 5px; }
+code > span.kw { color: #007020; font-weight: bold; } /* Keyword */
+code > span.dt { color: #902000; } /* DataType */
+code > span.dv { color: #40a070; } /* DecVal */
+code > span.bn { color: #40a070; } /* BaseN */
+code > span.fl { color: #40a070; } /* Float */
+code > span.ch { color: #4070a0; } /* Char */
+code > span.st { color: #4070a0; } /* String */
+code > span.co { color: #60a0b0; font-style: italic; } /* Comment */
+code > span.ot { color: #007020; } /* Other */
+code > span.al { color: #ff0000; font-weight: bold; } /* Alert */
+code > span.fu { color: #06287e; } /* Function */
+code > span.er { color: #ff0000; font-weight: bold; } /* Error */
+code > span.wa { color: #60a0b0; font-weight: bold; font-style: italic; } /* Warning */
+code > span.cn { color: #880000; } /* Constant */
+code > span.sc { color: #4070a0; } /* SpecialChar */
+code > span.vs { color: #4070a0; } /* VerbatimString */
+code > span.ss { color: #bb6688; } /* SpecialString */
+code > span.im { } /* Import */
+code > span.va { color: #19177c; } /* Variable */
+code > span.cf { color: #007020; font-weight: bold; } /* ControlFlow */
+code > span.op { color: #666666; } /* Operator */
+code > span.bu { } /* BuiltIn */
+code > span.ex { } /* Extension */
+code > span.pp { color: #bc7a00; } /* Preprocessor */
+code > span.at { color: #7d9029; } /* Attribute */
+code > span.do { color: #ba2121; font-style: italic; } /* Documentation */
+code > span.an { color: #60a0b0; font-weight: bold; font-style: italic; } /* Annotation */
+code > span.cv { color: #60a0b0; font-weight: bold; font-style: italic; } /* CommentVar */
+code > span.in { color: #60a0b0; font-weight: bold; font-style: italic; } /* Information */
+</style>
+
+
+
+<link href="data:text/css;charset=utf-8,body%20%7B%0Abackground%2Dcolor%3A%20%23fff%3B%0Amargin%3A%201em%20auto%3B%0Amax%2Dwidth%3A%20700px%3B%0Aoverflow%3A%20visible%3B%0Apadding%2Dleft%3A%202em%3B%0Apadding%2Dright%3A%202em%3B%0Afont%2Dfamily%3A%20%22Open%20Sans%22%2C%20%22Helvetica%20Neue%22%2C%20Helvetica%2C%20Arial%2C%20sans%2Dserif%3B%0Afont%2Dsize%3A%2014px%3B%0Aline%2Dheight%3A%201%2E35%3B%0A%7D%0A%23header%20%7B%0Atext%2Dalign%3A%20center%3B%0A%7D%0A%23TOC%20%7B%0Aclear%3A%20bot [...]
+
+</head>
+
+<body>
+
+
+
+
+<h1 class="title toc-ignore">Programming with dplyr</h1>
+
+
+
+<p>Most dplyr functions use non-standard evaluation (NSE). This is a catch-all term that means they don’t follow the usual R rules of evaluation. Instead, they capture the expression that you typed and evaluate it in a custom way. This has two main benefits for dplyr code:</p>
+<ul>
+<li><p>Operations on data frames can be expressed succinctly because you don’t need to repeat the name of the data frame. For example, you can write <code>filter(df, x == 1, y == 2, z == 3)</code> instead of <code>df[df$x == 1 & df$y ==2 & df$z == 3, ]</code>.</p></li>
+<li><p>dplyr can choose to compute results in a different way to base R. This is important for database backends because dplyr itself doesn’t do any work, but instead generates the SQL that tells the database what to do.</p></li>
+</ul>
+<p>Unfortunately these benefits do not come for free. There are two main drawbacks:</p>
+<ul>
+<li><p>Most dplyr arguments are not <strong>referentially transparent</strong>. That means you can’t replace a value with a seemingly equivalent object that you’ve defined elsewhere. In other words, this code:</p>
+<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r">df <-<span class="st"> </span><span class="kw">tibble</span>(<span class="dt">x =</span> <span class="dv">1</span><span class="op">:</span><span class="dv">3</span>, <span class="dt">y =</span> <span class="dv">3</span><span class="op">:</span><span class="dv">1</span>)
+<span class="kw">filter</span>(df, x <span class="op">==</span><span class="st"> </span><span class="dv">1</span>)
+<span class="co">#> # A tibble: 1 x 2</span>
+<span class="co">#> x y</span>
+<span class="co">#> <int> <int></span>
+<span class="co">#> 1 1 3</span></code></pre></div>
+<p>Is not equivalent to this code:</p>
+<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r">my_var <-<span class="st"> </span>x
+<span class="co">#> Error in eval(expr, envir, enclos): object 'x' not found</span>
+<span class="kw">filter</span>(df, my_var <span class="op">==</span><span class="st"> </span><span class="dv">1</span>)
+<span class="co">#> Error in filter_impl(.data, quo): Evaluation error: object 'my_var' not found.</span></code></pre></div>
+<p>nor to this code:</p>
+<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r">my_var <-<span class="st"> "x"</span>
+<span class="kw">filter</span>(df, my_var <span class="op">==</span><span class="st"> </span><span class="dv">1</span>)
+<span class="co">#> # A tibble: 0 x 2</span>
+<span class="co">#> # ... with 2 variables: x <int>, y <int></span></code></pre></div>
+<p>This makes it hard to create functions with arguments that change how dplyr verbs are computed.</p></li>
+<li><p>dplyr code is ambiguous. Depending on what variables are defined where, <code>filter(df, x == y)</code> could be equivalent to any of:</p>
+<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r">df[df<span class="op">$</span>x <span class="op">==</span><span class="st"> </span>df<span class="op">$</span>y, ]
+df[df<span class="op">$</span>x <span class="op">==</span><span class="st"> </span>y, ]
+df[x <span class="op">==</span><span class="st"> </span>df<span class="op">$</span>y, ]
+df[x <span class="op">==</span><span class="st"> </span>y, ]</code></pre></div>
+<p>This is useful when working interactively (because it saves typing and you quickly spot problems) but makes functions more unpredictable than you might desire.</p></li>
+</ul>
+<p>Fortunately, dplyr provides tools to overcome these challenges. They require a little more typing, but a small amount of upfront work is worth it because they help you save time in the long run.</p>
+<p>This vignette has two goals:</p>
+<ul>
+<li><p>Show you how to use dplyr’s <strong>pronouns</strong> and <strong>quasiquotation</strong> to write reliable functions that reduce duplication in your data analysis code.</p></li>
+<li><p>To teach you the underlying theory including <strong>quosures</strong>, the data structure that stores both an expression and an environment, and <strong>tidyeval</strong>, the underlying toolkit.</p></li>
+</ul>
+<p>We’ll start with a warmup, tying this problem to something you’re more familiar with, then move on to some practical tools, then dive into the deeper theory.</p>
+<div id="warm-up" class="section level2">
+<h2>Warm up</h2>
+<p>You might not have realised it, but you’re already accomplished at solving this type of problem in another domain: strings. It’s obvious that this function doesn’t do what you want:</p>
+<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r">greet <-<span class="st"> </span><span class="cf">function</span>(name) {
+ <span class="st">"How do you do, name?"</span>
+}
+<span class="kw">greet</span>(<span class="st">"Hadley"</span>)
+<span class="co">#> [1] "How do you do, name?"</span></code></pre></div>
+<p>That’s because <code>"</code> “quotes” its input: it doesn’t interpret what you’ve typed, it just stores it in a string. One way to make the function do what you want is to use <code>paste()</code> to build up the string piece by piece:</p>
+<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r">greet <-<span class="st"> </span><span class="cf">function</span>(name) {
+ <span class="kw">paste0</span>(<span class="st">"How do you do, "</span>, name, <span class="st">"?"</span>)
+}
+<span class="kw">greet</span>(<span class="st">"Hadley"</span>)
+<span class="co">#> [1] "How do you do, Hadley?"</span></code></pre></div>
+<p>Another approach is exemplified by the <strong>glue</strong> package: it allows you to “unquote” components of a string, replacing the string with the value of the R expression. This allows an elegant implementation of our function because <code>{name}</code> is replaced with the value of the <code>name</code> argument.</p>
+<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r">greet <-<span class="st"> </span><span class="cf">function</span>(name) {
+ glue<span class="op">::</span><span class="kw">glue</span>(<span class="st">"How do you do, {name}?"</span>)
+}
+<span class="kw">greet</span>(<span class="st">"Hadley"</span>)
+<span class="co">#> How do you do, Hadley?</span></code></pre></div>
+</div>
+<div id="programming-recipes" class="section level2">
+<h2>Programming recipes</h2>
+<p>The following recipes walk you through the basics of tidyeval, with the nominal goal of reducing duplication in dplyr code. The examples here are somewhat inauthentic because we’ve reduced them down to very simple components to make them easier to understand. They’re so simple that you might wonder why we bother writing a function at all. But it’s a good idea to learn the ideas on simple examples, so that you’re better prepared to apply them to the more complex situations you’ll see i [...]
+<div id="different-data-sets" class="section level3">
+<h3>Different data sets</h3>
+<p>You already know how to write functions that work with the first argument of dplyr verbs: the data. That’s because dplyr doesn’t do anything special with that argument, so it’s referentially transparent. For example, if you saw repeated code like this:</p>
+<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"><span class="kw">mutate</span>(df1, <span class="dt">y =</span> a <span class="op">+</span><span class="st"> </span>x)
+<span class="kw">mutate</span>(df2, <span class="dt">y =</span> a <span class="op">+</span><span class="st"> </span>x)
+<span class="kw">mutate</span>(df3, <span class="dt">y =</span> a <span class="op">+</span><span class="st"> </span>x)
+<span class="kw">mutate</span>(df4, <span class="dt">y =</span> a <span class="op">+</span><span class="st"> </span>x)</code></pre></div>
+<p>You could already write a function to capture that duplication:</p>
+<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r">mutate_y <-<span class="st"> </span><span class="cf">function</span>(df) {
+ <span class="kw">mutate</span>(df, <span class="dt">y =</span> a <span class="op">+</span><span class="st"> </span>x)
+}</code></pre></div>
+<p>Unfortunately, there’s a drawback to this simple approach: it can fail silently if one of the variables isn’t present in the data frame, but is present in the global environment.</p>
+<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r">df1 <-<span class="st"> </span><span class="kw">tibble</span>(<span class="dt">x =</span> <span class="dv">1</span><span class="op">:</span><span class="dv">3</span>)
+a <-<span class="st"> </span><span class="dv">10</span>
+<span class="kw">mutate_y</span>(df1)
+<span class="co">#> # A tibble: 3 x 2</span>
+<span class="co">#> x y</span>
+<span class="co">#> <int> <dbl></span>
+<span class="co">#> 1 1 11.0</span>
+<span class="co">#> 2 2 12.0</span>
+<span class="co">#> 3 3 13.0</span></code></pre></div>
+<p>We can fix that ambiguity by being more explicit and using the <code>.data</code> pronoun. This will throw an informative error if the variable doesn’t exist:</p>
+<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r">mutate_y <-<span class="st"> </span><span class="cf">function</span>(df) {
+ <span class="kw">mutate</span>(df, <span class="dt">y =</span> .data<span class="op">$</span>a <span class="op">+</span><span class="st"> </span>.data<span class="op">$</span>x)
+}
+
+<span class="kw">mutate_y</span>(df1)
+<span class="co">#> Error in mutate_impl(.data, dots): Evaluation error: Column `a`: not found in data.</span></code></pre></div>
+<p>If this function is in a package, using <code>.data</code> also prevents <code>R CMD check</code> from giving a NOTE about undefined global variables (provided that you’ve also imported <code>rlang::.data</code> with <code>@importFrom rlang .data</code>).</p>
+</div>
+<div id="different-expressions" class="section level3">
+<h3>Different expressions</h3>
+<p>Writing a function is hard if you want one of the arguments to be a variable name (like <code>x</code>) or an expression (like <code>x + y</code>). That’s because dplyr automatically “quotes” those inputs, so they are not referentially transparent. Let’s start with a simple case: you want to vary the grouping variable for a data summarization.</p>
+<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r">df <-<span class="st"> </span><span class="kw">tibble</span>(
+ <span class="dt">g1 =</span> <span class="kw">c</span>(<span class="dv">1</span>, <span class="dv">1</span>, <span class="dv">2</span>, <span class="dv">2</span>, <span class="dv">2</span>),
+ <span class="dt">g2 =</span> <span class="kw">c</span>(<span class="dv">1</span>, <span class="dv">2</span>, <span class="dv">1</span>, <span class="dv">2</span>, <span class="dv">1</span>),
+ <span class="dt">a =</span> <span class="kw">sample</span>(<span class="dv">5</span>),
+ <span class="dt">b =</span> <span class="kw">sample</span>(<span class="dv">5</span>)
+)
+
+df <span class="op">%>%</span>
+<span class="st"> </span><span class="kw">group_by</span>(g1) <span class="op">%>%</span>
+<span class="st"> </span><span class="kw">summarise</span>(<span class="dt">a =</span> <span class="kw">mean</span>(a))
+<span class="co">#> # A tibble: 2 x 2</span>
+<span class="co">#> g1 a</span>
+<span class="co">#> <dbl> <dbl></span>
+<span class="co">#> 1 1.00 2.50</span>
+<span class="co">#> 2 2.00 3.33</span>
+
+df <span class="op">%>%</span>
+<span class="st"> </span><span class="kw">group_by</span>(g2) <span class="op">%>%</span>
+<span class="st"> </span><span class="kw">summarise</span>(<span class="dt">a =</span> <span class="kw">mean</span>(a))
+<span class="co">#> # A tibble: 2 x 2</span>
+<span class="co">#> g2 a</span>
+<span class="co">#> <dbl> <dbl></span>
+<span class="co">#> 1 1.00 2.00</span>
+<span class="co">#> 2 2.00 4.50</span></code></pre></div>
+<p>You might hope that this will work:</p>
+<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r">my_summarise <-<span class="st"> </span><span class="cf">function</span>(df, group_var) {
+ df <span class="op">%>%</span>
+<span class="st"> </span><span class="kw">group_by</span>(group_var) <span class="op">%>%</span>
+<span class="st"> </span><span class="kw">summarise</span>(<span class="dt">a =</span> <span class="kw">mean</span>(a))
+}
+
+<span class="kw">my_summarise</span>(df, g1)
+<span class="co">#> Error in grouped_df_impl(data, unname(vars), drop): Column `group_var` is unknown</span></code></pre></div>
+<p>But it doesn’t.</p>
+<p>Maybe providing the variable name as a string will fix things?</p>
+<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"><span class="kw">my_summarise</span>(df, <span class="st">"g2"</span>)
+<span class="co">#> Error in grouped_df_impl(data, unname(vars), drop): Column `group_var` is unknown</span></code></pre></div>
+<p>Nope.</p>
+<p>If you look carefully at the error message, you’ll see that it’s the same in both cases. <code>group_by()</code> works like <code>"</code>: it doesn’t evaluate its input; it quotes it.</p>
+<p>To make this function work, we need to do two things. We need to quote the input ourselves (so <code>my_summarise()</code> can take a bare variable name like <code>group_by()</code>), and then we need to tell <code>group_by()</code> not to quote its input (because we’ve done the quoting).</p>
+<p>How do we quote the input? We can’t use <code>""</code> to quote the input, because that gives us a string. Instead we need a function that captures the expression and its environment (we’ll come back to why this is important later on). There are two possible options we could use in base R, the function <code>quote()</code> and the operator <code>~</code>. Neither of these work quite the way we want, so we need a new function: <code>quo()</code>.</p>
+<p><code>quo()</code> works like <code>"</code>: it quotes its input rather than evaluating it.</p>
+<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"><span class="kw">quo</span>(g1)
+<span class="co">#> <quosure: global></span>
+<span class="co">#> ~g1</span>
+<span class="kw">quo</span>(a <span class="op">+</span><span class="st"> </span>b <span class="op">+</span><span class="st"> </span>c)
+<span class="co">#> <quosure: global></span>
+<span class="co">#> ~a + b + c</span>
+<span class="kw">quo</span>(<span class="st">"a"</span>)
+<span class="co">#> <quosure: empty></span>
+<span class="co">#> ~"a"</span></code></pre></div>
+<p><code>quo()</code> returns a <strong>quosure</strong>, which is a special type of formula. You’ll learn more about quosures later on.</p>
+<p>Now that we’ve captured this expression, how do we use it with <code>group_by()</code>? It doesn’t work if we just shove it into our naive approach:</p>
+<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"><span class="kw">my_summarise</span>(df, <span class="kw">quo</span>(g1))
+<span class="co">#> Error in grouped_df_impl(data, unname(vars), drop): Column `group_var` is unknown</span></code></pre></div>
+<p>We get the same error as before, because we haven’t yet told <code>group_by()</code> that we’re taking care of the quoting. In other words, we need to tell <code>group_by()</code> not to quote its input, because it has been pre-quoted by <code>my_summarise()</code>. Yet another way of saying the same thing is that we want to <strong>unquote</strong> <code>group_var</code>.</p>
+<p>In dplyr (and in tidyeval in general) you use <code>!!</code> to say that you want to unquote an input so that it’s evaluated, not quoted. This gives us a function that actually does what we want.</p>
+<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r">my_summarise <-<span class="st"> </span><span class="cf">function</span>(df, group_var) {
+ df <span class="op">%>%</span>
+<span class="st"> </span><span class="kw">group_by</span>(<span class="op">!!</span>group_var) <span class="op">%>%</span>
+<span class="st"> </span><span class="kw">summarise</span>(<span class="dt">a =</span> <span class="kw">mean</span>(a))
+}
+
+<span class="kw">my_summarise</span>(df, <span class="kw">quo</span>(g1))
+<span class="co">#> # A tibble: 2 x 2</span>
+<span class="co">#> g1 a</span>
+<span class="co">#> <dbl> <dbl></span>
+<span class="co">#> 1 1.00 2.50</span>
+<span class="co">#> 2 2.00 3.33</span></code></pre></div>
+<p>Huzzah!</p>
+<p>There’s just one step left: we want to call this function like we call <code>group_by()</code>:</p>
+<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"><span class="kw">my_summarise</span>(df, g1)</code></pre></div>
+<p>This doesn’t work because there’s no object called <code>g1</code>. We need to capture what the user of the function typed and quote it for them. You might try using <code>quo()</code> to do that:</p>
+<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r">my_summarise <-<span class="st"> </span><span class="cf">function</span>(df, group_var) {
+ quo_group_var <-<span class="st"> </span><span class="kw">quo</span>(group_var)
+ <span class="kw">print</span>(quo_group_var)
+
+ df <span class="op">%>%</span>
+<span class="st"> </span><span class="kw">group_by</span>(<span class="op">!!</span>quo_group_var) <span class="op">%>%</span>
+<span class="st"> </span><span class="kw">summarise</span>(<span class="dt">a =</span> <span class="kw">mean</span>(a))
+}
+
+<span class="kw">my_summarise</span>(df, g1)
+<span class="co">#> <quosure: frame></span>
+<span class="co">#> ~group_var</span>
+<span class="co">#> Error in grouped_df_impl(data, unname(vars), drop): Column `group_var` is unknown</span></code></pre></div>
+<p>I’ve added a <code>print()</code> call to make it obvious what’s going wrong here: <code>quo(group_var)</code> always returns <code>~group_var</code>. It is being too literal! We want it to substitute the value that the user supplied, i.e. to return <code>~g1</code>.</p>
+<p>By analogy to strings, we don’t want <code>""</code>, instead we want some function that turns an argument into a string. That’s the job of <code>enquo()</code>. <code>enquo()</code> uses some dark magic to look at the argument, see what the user typed, and return that value as a quosure. (Technically, this works because function arguments are evaluated lazily, using a special data structure called a <strong>promise</strong>.)</p>
+<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r">my_summarise <-<span class="st"> </span><span class="cf">function</span>(df, group_var) {
+ group_var <-<span class="st"> </span><span class="kw">enquo</span>(group_var)
+ <span class="kw">print</span>(group_var)
+
+ df <span class="op">%>%</span>
+<span class="st"> </span><span class="kw">group_by</span>(<span class="op">!!</span>group_var) <span class="op">%>%</span>
+<span class="st"> </span><span class="kw">summarise</span>(<span class="dt">a =</span> <span class="kw">mean</span>(a))
+}
+
+<span class="kw">my_summarise</span>(df, g1)
+<span class="co">#> <quosure: global></span>
+<span class="co">#> ~g1</span>
+<span class="co">#> # A tibble: 2 x 2</span>
+<span class="co">#> g1 a</span>
+<span class="co">#> <dbl> <dbl></span>
+<span class="co">#> 1 1.00 2.50</span>
+<span class="co">#> 2 2.00 3.33</span></code></pre></div>
+<p>(If you’re familiar with <code>quote()</code> and <code>substitute()</code> in base R, <code>quo()</code> is equivalent to <code>quote()</code> and <code>enquo()</code> is equivalent to <code>substitute()</code>.)</p>
+<p>You might wonder how to extend this to handle multiple grouping variables: we’ll come back to that a little later.</p>
+</div>
+<div id="different-input-variable" class="section level3">
+<h3>Different input variable</h3>
+<p>Now let’s tackle something a bit more complicated. The code below shows a duplicate <code>summarise()</code> statement where we compute three summaries, varying the input variable.</p>
+<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"><span class="kw">summarise</span>(df, <span class="dt">mean =</span> <span class="kw">mean</span>(a), <span class="dt">sum =</span> <span class="kw">sum</span>(a), <span class="dt">n =</span> <span class="kw">n</span>())
+<span class="co">#> # A tibble: 1 x 3</span>
+<span class="co">#> mean sum n</span>
+<span class="co">#> <dbl> <int> <int></span>
+<span class="co">#> 1 3.00 15 5</span>
+<span class="kw">summarise</span>(df, <span class="dt">mean =</span> <span class="kw">mean</span>(a <span class="op">*</span><span class="st"> </span>b), <span class="dt">sum =</span> <span class="kw">sum</span>(a <span class="op">*</span><span class="st"> </span>b), <span class="dt">n =</span> <span class="kw">n</span>())
+<span class="co">#> # A tibble: 1 x 3</span>
+<span class="co">#> mean sum n</span>
+<span class="co">#> <dbl> <int> <int></span>
+<span class="co">#> 1 9.60 48 5</span></code></pre></div>
+<p>To turn this into a function, we start by testing the basic approach interactively: we quote the variable with <code>quo()</code>, then unquote it in the dplyr call with <code>!!</code>. Notice that we can unquote anywhere inside a complicated expression.</p>
+<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r">my_var <-<span class="st"> </span><span class="kw">quo</span>(a)
+<span class="kw">summarise</span>(df, <span class="dt">mean =</span> <span class="kw">mean</span>(<span class="op">!!</span>my_var), <span class="dt">sum =</span> <span class="kw">sum</span>(<span class="op">!!</span>my_var), <span class="dt">n =</span> <span class="kw">n</span>())
+<span class="co">#> # A tibble: 1 x 3</span>
+<span class="co">#> mean sum n</span>
+<span class="co">#> <dbl> <int> <int></span>
+<span class="co">#> 1 3.00 15 5</span></code></pre></div>
+<p>You can also wrap <code>quo()</code> around the dplyr call to see what will happen from dplyr’s perspective. This is a very useful tool for debugging.</p>
+<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"><span class="kw">quo</span>(<span class="kw">summarise</span>(df,
+ <span class="dt">mean =</span> <span class="kw">mean</span>(<span class="op">!!</span>my_var),
+ <span class="dt">sum =</span> <span class="kw">sum</span>(<span class="op">!!</span>my_var),
+ <span class="dt">n =</span> <span class="kw">n</span>()
+))
+<span class="co">#> <quosure: global></span>
+<span class="co">#> ~summarise(df, mean = mean(~a), sum = sum(~a), n = n())</span></code></pre></div>
+<p>Now we can turn our code into a function (remembering to replace <code>quo()</code> with <code>enquo()</code>), and check that it works:</p>
+<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r">my_summarise2 <-<span class="st"> </span><span class="cf">function</span>(df, expr) {
+ expr <-<span class="st"> </span><span class="kw">enquo</span>(expr)
+
+ <span class="kw">summarise</span>(df,
+ <span class="dt">mean =</span> <span class="kw">mean</span>(<span class="op">!!</span>expr),
+ <span class="dt">sum =</span> <span class="kw">sum</span>(<span class="op">!!</span>expr),
+ <span class="dt">n =</span> <span class="kw">n</span>()
+ )
+}
+<span class="kw">my_summarise2</span>(df, a)
+<span class="co">#> # A tibble: 1 x 3</span>
+<span class="co">#> mean sum n</span>
+<span class="co">#> <dbl> <int> <int></span>
+<span class="co">#> 1 3.00 15 5</span>
+<span class="kw">my_summarise2</span>(df, a <span class="op">*</span><span class="st"> </span>b)
+<span class="co">#> # A tibble: 1 x 3</span>
+<span class="co">#> mean sum n</span>
+<span class="co">#> <dbl> <int> <int></span>
+<span class="co">#> 1 9.60 48 5</span></code></pre></div>
+</div>
+<div id="different-input-and-output-variable" class="section level3">
+<h3>Different input and output variable</h3>
+<p>The next challenge is to vary the name of the output variables:</p>
+<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"><span class="kw">mutate</span>(df, <span class="dt">mean_a =</span> <span class="kw">mean</span>(a), <span class="dt">sum_a =</span> <span class="kw">sum</span>(a))
+<span class="co">#> # A tibble: 5 x 6</span>
+<span class="co">#> g1 g2 a b mean_a sum_a</span>
+<span class="co">#> <dbl> <dbl> <int> <int> <dbl> <int></span>
+<span class="co">#> 1 1.00 1.00 1 3 3.00 15</span>
+<span class="co">#> 2 1.00 2.00 4 2 3.00 15</span>
+<span class="co">#> 3 2.00 1.00 2 1 3.00 15</span>
+<span class="co">#> 4 2.00 2.00 5 4 3.00 15</span>
+<span class="co">#> # ... with 1 more row</span>
+<span class="kw">mutate</span>(df, <span class="dt">mean_b =</span> <span class="kw">mean</span>(b), <span class="dt">sum_b =</span> <span class="kw">sum</span>(b))
+<span class="co">#> # A tibble: 5 x 6</span>
+<span class="co">#> g1 g2 a b mean_b sum_b</span>
+<span class="co">#> <dbl> <dbl> <int> <int> <dbl> <int></span>
+<span class="co">#> 1 1.00 1.00 1 3 3.00 15</span>
+<span class="co">#> 2 1.00 2.00 4 2 3.00 15</span>
+<span class="co">#> 3 2.00 1.00 2 1 3.00 15</span>
+<span class="co">#> 4 2.00 2.00 5 4 3.00 15</span>
+<span class="co">#> # ... with 1 more row</span></code></pre></div>
+<p>This code is similar to the previous example, but there are two new wrinkles:</p>
+<ul>
+<li><p>We create the new names by pasting together strings, so we need <code>quo_name()</code> to convert the input expression to a string.</p></li>
+<li><p><code>!!mean_name = mean(!!expr)</code> isn’t valid R code, so we need to use the <code>:=</code> helper provided by rlang.</p></li>
+</ul>
+<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r">my_mutate <-<span class="st"> </span><span class="cf">function</span>(df, expr) {
+ expr <-<span class="st"> </span><span class="kw">enquo</span>(expr)
+ mean_name <-<span class="st"> </span><span class="kw">paste0</span>(<span class="st">"mean_"</span>, <span class="kw">quo_name</span>(expr))
+ sum_name <-<span class="st"> </span><span class="kw">paste0</span>(<span class="st">"sum_"</span>, <span class="kw">quo_name</span>(expr))
+
+ <span class="kw">mutate</span>(df,
+ <span class="op">!!</span>mean_name <span class="op">:</span><span class="er">=</span><span class="st"> </span><span class="kw">mean</span>(<span class="op">!!</span>expr),
+ <span class="op">!!</span>sum_name <span class="op">:</span><span class="er">=</span><span class="st"> </span><span class="kw">sum</span>(<span class="op">!!</span>expr)
+ )
+}
+
+<span class="kw">my_mutate</span>(df, a)
+<span class="co">#> # A tibble: 5 x 6</span>
+<span class="co">#> g1 g2 a b mean_a sum_a</span>
+<span class="co">#> <dbl> <dbl> <int> <int> <dbl> <int></span>
+<span class="co">#> 1 1.00 1.00 1 3 3.00 15</span>
+<span class="co">#> 2 1.00 2.00 4 2 3.00 15</span>
+<span class="co">#> 3 2.00 1.00 2 1 3.00 15</span>
+<span class="co">#> 4 2.00 2.00 5 4 3.00 15</span>
+<span class="co">#> # ... with 1 more row</span></code></pre></div>
+</div>
+<div id="capturing-multiple-variables" class="section level3">
+<h3>Capturing multiple variables</h3>
+<p>It would be nice to extend <code>my_summarise()</code> to accept any number of grouping variables. We need to make three changes:</p>
+<ul>
+<li><p>Use <code>...</code> in the function definition so our function can accept any number of arguments.</p></li>
+<li><p>Use <code>quos()</code> to capture all the <code>...</code> as a list of formulas.</p></li>
+<li><p>Use <code>!!!</code> instead of <code>!!</code> to <strong>splice</strong> the arguments into <code>group_by()</code>.</p></li>
+</ul>
+<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r">my_summarise <-<span class="st"> </span><span class="cf">function</span>(df, ...) {
+ group_var <-<span class="st"> </span><span class="kw">quos</span>(...)
+
+ df <span class="op">%>%</span>
+<span class="st"> </span><span class="kw">group_by</span>(<span class="op">!!!</span>group_var) <span class="op">%>%</span>
+<span class="st"> </span><span class="kw">summarise</span>(<span class="dt">a =</span> <span class="kw">mean</span>(a))
+}
+
+<span class="kw">my_summarise</span>(df, g1, g2)
+<span class="co">#> # A tibble: 4 x 3</span>
+<span class="co">#> # Groups: g1 [?]</span>
+<span class="co">#> g1 g2 a</span>
+<span class="co">#> <dbl> <dbl> <dbl></span>
+<span class="co">#> 1 1.00 1.00 1.00</span>
+<span class="co">#> 2 1.00 2.00 4.00</span>
+<span class="co">#> 3 2.00 1.00 2.50</span>
+<span class="co">#> 4 2.00 2.00 5.00</span></code></pre></div>
+<p><code>!!!</code> takes a list of elements and splices them into the current call. Look at the bottom of the <code>!!!</code> and think <code>...</code>.</p>
+<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r">args <-<span class="st"> </span><span class="kw">list</span>(<span class="dt">na.rm =</span> <span class="ot">TRUE</span>, <span class="dt">trim =</span> <span class="fl">0.25</span>)
+<span class="kw">quo</span>(<span class="kw">mean</span>(x, <span class="op">!!!</span><span class="st"> </span>args))
+<span class="co">#> <quosure: global></span>
+<span class="co">#> ~mean(x, na.rm = TRUE, trim = 0.25)</span>
+
+args <-<span class="st"> </span><span class="kw">list</span>(<span class="kw">quo</span>(x), <span class="dt">na.rm =</span> <span class="ot">TRUE</span>, <span class="dt">trim =</span> <span class="fl">0.25</span>)
+<span class="kw">quo</span>(<span class="kw">mean</span>(<span class="op">!!!</span><span class="st"> </span>args))
+<span class="co">#> <quosure: global></span>
+<span class="co">#> ~mean(~x, na.rm = TRUE, trim = 0.25)</span></code></pre></div>
+<p>Now that you’ve learned the basics of tidyeval through some practical examples, we’ll dive into the theory. This will help you generalise what you’ve learned here to new situations.</p>
+</div>
+</div>
+<div id="quoting" class="section level2">
+<h2>Quoting</h2>
+<p>Quoting is the action of capturing an expression instead of evaluating it. All expression-based functions quote their arguments and get the R code as an expression rather than the result of evaluating that code. If you are an R user, you probably quote expressions on a regular basis. One of the most important quoting operators in R is the <em>formula</em>. It is famously used for the specification of statistical models:</p>
+<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r">disp <span class="op">~</span><span class="st"> </span>cyl <span class="op">+</span><span class="st"> </span>drat
+<span class="co">#> disp ~ cyl + drat</span></code></pre></div>
+<p>The other quoting operator in base R is <code>quote()</code>. It returns a raw expression rather than a formula:</p>
+<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"><span class="co"># Computing the value of the expression:</span>
+<span class="kw">toupper</span>(letters[<span class="dv">1</span><span class="op">:</span><span class="dv">5</span>])
+<span class="co">#> [1] "A" "B" "C" "D" "E"</span>
+
+<span class="co"># Capturing the expression:</span>
+<span class="kw">quote</span>(<span class="kw">toupper</span>(letters[<span class="dv">1</span><span class="op">:</span><span class="dv">5</span>]))
+<span class="co">#> toupper(letters[1:5])</span></code></pre></div>
+<p>(Note that despite being called the double quote, <code>"</code> is not a quoting operator in this context, because it generates a string, not an expression.)</p>
+<p>In practice, the formula is the better of the two options because it captures the code and its execution <strong>environment</strong>. This is important because even a simple expression can yield different values in different environments. For example, the <code>x</code> in the following two expressions refers to different values:</p>
+<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r">f <-<span class="st"> </span><span class="cf">function</span>(x) {
+ <span class="kw">quo</span>(x)
+}
+
+x1 <-<span class="st"> </span><span class="kw">f</span>(<span class="dv">10</span>)
+x2 <-<span class="st"> </span><span class="kw">f</span>(<span class="dv">100</span>)</code></pre></div>
+<p>It might look like the expressions are the same if you print them out.</p>
+<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r">x1
+<span class="co">#> <quosure: local></span>
+<span class="co">#> ~x</span>
+x2
+<span class="co">#> <quosure: local></span>
+<span class="co">#> ~x</span></code></pre></div>
+<p>But if you inspect the environments using <code>rlang::get_env()</code> — they’re different.</p>
+<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"><span class="kw">library</span>(rlang)
+
+<span class="kw">get_env</span>(x1)
+<span class="co">#> <environment: 0x557ddf5b1ab0></span>
+<span class="kw">get_env</span>(x2)
+<span class="co">#> <environment: 0x557ddfa6e6c8></span></code></pre></div>
+<p>Further, when we evaluate those formulas using <code>rlang::eval_tidy()</code>, we see that they yield different values:</p>
+<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"><span class="kw">eval_tidy</span>(x1)
+<span class="co">#> [1] 10</span>
+<span class="kw">eval_tidy</span>(x2)
+<span class="co">#> [1] 100</span></code></pre></div>
+<p>This is a key property of R: one name can refer to different values in different environments. This is also important for dplyr, because it allows you to combine variables and objects in a call:</p>
+<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r">user_var <-<span class="st"> </span><span class="dv">1000</span>
+mtcars <span class="op">%>%</span><span class="st"> </span><span class="kw">summarise</span>(<span class="dt">cyl =</span> <span class="kw">mean</span>(cyl) <span class="op">*</span><span class="st"> </span>user_var)
+<span class="co">#> cyl</span>
+<span class="co">#> 1 6187.5</span></code></pre></div>
+<p>When an object keeps track of an environment, it is said to have an enclosure. This is the reason that functions in R are sometimes referred to as closures:</p>
+<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"><span class="kw">typeof</span>(mean)
+<span class="co">#> [1] "closure"</span></code></pre></div>
+<p>For this reason we use a special name to refer to one-sided formulas: <strong>quosures</strong>. One-sided formulas are quotes (they carry an expression) with an environment.</p>
+<p>Quosures are regular R objects. They can be stored in a variable and inspected:</p>
+<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r">var <-<span class="st"> </span><span class="er">~</span><span class="kw">toupper</span>(letters[<span class="dv">1</span><span class="op">:</span><span class="dv">5</span>])
+var
+<span class="co">#> ~toupper(letters[1:5])</span>
+
+<span class="co"># You can extract its expression:</span>
+<span class="kw">get_expr</span>(var)
+<span class="co">#> toupper(letters[1:5])</span>
+
+<span class="co"># Or inspect its enclosure:</span>
+<span class="kw">get_env</span>(var)
+<span class="co">#> <environment: R_GlobalEnv></span></code></pre></div>
+</div>
+<div id="quasiquotation" class="section level2">
+<h2>Quasiquotation</h2>
+<blockquote>
+<p>Put simply, quasi-quotation enables one to introduce symbols that stand for a linguistic expression in a given instance and are used as that linguistic expression in a different instance. — <a href="https://en.wikipedia.org/wiki/Quasi-quotation">Willard van Orman Quine</a></p>
+</blockquote>
+<p>Automatic quoting makes dplyr very convenient for interactive use. But if you want to program with dplyr, you need some way to refer to variables indirectly. The solution to this problem is <strong>quasiquotation</strong>, which allows you to evaluate directly inside an expression that is otherwise quoted.</p>
+<p>Quasiquotation was coined by Willard van Orman Quine in the 1940s, and was adopted for programming by the LISP community in the 1970s. All expression-based functions in the tidyeval framework support quasiquotation. Unquoting cancels quotation of parts of an expression. There are three types of unquoting:</p>
+<ul>
+<li>basic</li>
+<li>unquote splicing</li>
+<li>unquoting names</li>
+</ul>
+<div id="unquoting" class="section level3">
+<h3>Unquoting</h3>
+<p>The first important operation is the basic unquote, which comes in a functional form, <code>UQ()</code>, and as syntactic-sugar, <code>!!</code>.</p>
+<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"><span class="co"># Here we capture `letters[1:5]` as an expression:</span>
+<span class="kw">quo</span>(<span class="kw">toupper</span>(letters[<span class="dv">1</span><span class="op">:</span><span class="dv">5</span>]))
+<span class="co">#> <quosure: global></span>
+<span class="co">#> ~toupper(letters[1:5])</span>
+
+<span class="co"># Here we capture the value of `letters[1:5]`</span>
+<span class="kw">quo</span>(<span class="kw">toupper</span>(<span class="op">!!</span>letters[<span class="dv">1</span><span class="op">:</span><span class="dv">5</span>]))
+<span class="co">#> <quosure: global></span>
+<span class="co">#> ~toupper(c("a", "b", "c", "d", "e"))</span>
+<span class="kw">quo</span>(<span class="kw">toupper</span>(<span class="kw">UQ</span>(letters[<span class="dv">1</span><span class="op">:</span><span class="dv">5</span>])))
+<span class="co">#> <quosure: global></span>
+<span class="co">#> ~toupper(c("a", "b", "c", "d", "e"))</span></code></pre></div>
+<p>It is also possible to unquote other quoted expressions. Unquoting such symbolic objects provides a powerful way of manipulating expressions.</p>
+<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r">var1 <-<span class="st"> </span><span class="kw">quo</span>(letters[<span class="dv">1</span><span class="op">:</span><span class="dv">5</span>])
+<span class="kw">quo</span>(<span class="kw">toupper</span>(<span class="op">!!</span>var1))
+<span class="co">#> <quosure: global></span>
+<span class="co">#> ~toupper(~letters[1:5])</span></code></pre></div>
+<p>You can safely unquote quosures because they track their environments, and tidyeval functions know how to evaluate them. This allows any depth of quoting and unquoting.</p>
+<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r">my_mutate <-<span class="st"> </span><span class="cf">function</span>(x) {
+ mtcars <span class="op">%>%</span>
+<span class="st"> </span><span class="kw">select</span>(cyl) <span class="op">%>%</span>
+<span class="st"> </span><span class="kw">slice</span>(<span class="dv">1</span><span class="op">:</span><span class="dv">4</span>) <span class="op">%>%</span>
+<span class="st"> </span><span class="kw">mutate</span>(<span class="dt">cyl2 =</span> cyl <span class="op">+</span><span class="st"> </span>(<span class="op">!!</span><span class="st"> </span>x))
+}
+
+f <-<span class="st"> </span><span class="cf">function</span>(x) <span class="kw">quo</span>(x)
+expr1 <-<span class="st"> </span><span class="kw">f</span>(<span class="dv">100</span>)
+expr2 <-<span class="st"> </span><span class="kw">f</span>(<span class="dv">10</span>)
+
+<span class="kw">my_mutate</span>(expr1)
+<span class="co">#> # A tibble: 4 x 2</span>
+<span class="co">#> cyl cyl2</span>
+<span class="co">#> <dbl> <dbl></span>
+<span class="co">#> 1 6.00 106</span>
+<span class="co">#> 2 6.00 106</span>
+<span class="co">#> 3 4.00 104</span>
+<span class="co">#> 4 6.00 106</span>
+<span class="kw">my_mutate</span>(expr2)
+<span class="co">#> # A tibble: 4 x 2</span>
+<span class="co">#> cyl cyl2</span>
+<span class="co">#> <dbl> <dbl></span>
+<span class="co">#> 1 6.00 16.0</span>
+<span class="co">#> 2 6.00 16.0</span>
+<span class="co">#> 3 4.00 14.0</span>
+<span class="co">#> 4 6.00 16.0</span></code></pre></div>
+<p>The functional form is useful in cases where the precedence of <code>!</code> causes problems:</p>
+<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r">my_fun <-<span class="st"> </span><span class="kw">quo</span>(fun)
+<span class="kw">quo</span>(<span class="op">!!</span><span class="kw">my_fun</span>(x, y, z))
+<span class="co">#> Error in my_fun(x, y, z): could not find function "my_fun"</span>
+<span class="kw">quo</span>(<span class="kw">UQ</span>(my_fun)(x, y, z))
+<span class="co">#> <quosure: global></span>
+<span class="co">#> ~(~fun)(x, y, z)</span>
+
+my_var <-<span class="st"> </span><span class="kw">quo</span>(x)
+<span class="kw">quo</span>(<span class="kw">filter</span>(df, <span class="op">!!</span>my_var <span class="op">==</span><span class="st"> </span><span class="dv">1</span>))
+<span class="co">#> <quosure: global></span>
+<span class="co">#> ~filter(df, FALSE)</span>
+<span class="kw">quo</span>(<span class="kw">filter</span>(df, <span class="kw">UQ</span>(my_var) <span class="op">==</span><span class="st"> </span><span class="dv">1</span>))
+<span class="co">#> <quosure: global></span>
+<span class="co">#> ~filter(df, (~x) == 1)</span></code></pre></div>
+<p>You’ll note above that <code>UQ()</code> yields a quosure containing a formula. That ensures that when the quosure is evaluated, it’ll be looked up in the right environment. In certain code-generation scenarios you just want to use the expression and ignore the environment. That’s the job of <code>UQE()</code>:</p>
+<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"><span class="kw">quo</span>(<span class="kw">UQE</span>(my_fun)(x, y, z))
+<span class="co">#> <quosure: global></span>
+<span class="co">#> ~fun(x, y, z)</span>
+<span class="kw">quo</span>(<span class="kw">filter</span>(df, <span class="kw">UQE</span>(my_var) <span class="op">==</span><span class="st"> </span><span class="dv">1</span>))
+<span class="co">#> <quosure: global></span>
+<span class="co">#> ~filter(df, x == 1)</span></code></pre></div>
+<p><code>UQE()</code> is for expert use only as you’ll have to carefully analyse the environments to ensure that the generated code is correct.</p>
+</div>
+<div id="unquote-splicing" class="section level3">
+<h3>Unquote-splicing</h3>
+<p>The second unquote operation is unquote-splicing. Its functional form is <code>UQS()</code> and the syntactic shortcut is <code>!!!</code>. It takes a vector and inserts each element of the vector in the surrounding function call:</p>
+<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"><span class="kw">quo</span>(<span class="kw">list</span>(<span class="op">!!!</span><span class="st"> </span>letters[<span class="dv">1</span><span class="op">:</span><span class="dv">5</span>]))
+<span class="co">#> <quosure: global></span>
+<span class="co">#> ~list("a", "b", "c", "d", "e")</span></code></pre></div>
+<p>A very useful feature of unquote-splicing is that the vector names become argument names:</p>
+<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r">x <-<span class="st"> </span><span class="kw">list</span>(<span class="dt">foo =</span> 1L, <span class="dt">bar =</span> <span class="kw">quo</span>(baz))
+<span class="kw">quo</span>(<span class="kw">list</span>(<span class="op">!!!</span><span class="st"> </span>x))
+<span class="co">#> <quosure: global></span>
+<span class="co">#> ~list(foo = 1L, bar = ~baz)</span></code></pre></div>
+<p>This makes it easy to program with dplyr verbs that take named dots:</p>
+<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r">args <-<span class="st"> </span><span class="kw">list</span>(<span class="dt">mean =</span> <span class="kw">quo</span>(<span class="kw">mean</span>(cyl)), <span class="dt">count =</span> <span class="kw">quo</span>(<span class="kw">n</span>()))
+mtcars <span class="op">%>%</span>
+<span class="st"> </span><span class="kw">group_by</span>(am) <span class="op">%>%</span>
+<span class="st"> </span><span class="kw">summarise</span>(<span class="op">!!!</span><span class="st"> </span>args)
+<span class="co">#> # A tibble: 2 x 3</span>
+<span class="co">#> am mean count</span>
+<span class="co">#> <dbl> <dbl> <int></span>
+<span class="co">#> 1 0 6.95 19</span>
+<span class="co">#> 2 1.00 5.08 13</span></code></pre></div>
+</div>
+<div id="setting-variable-names" class="section level3">
+<h3>Setting variable names</h3>
+<p>The final unquote operation is setting argument names. You’ve seen one way to do that above, but you can also use the definition operator <code>:=</code> instead of <code>=</code>. <code>:=</code> supports unquoting on both the LHS and the RHS.</p>
+<p>The rules on the LHS are slightly different: the unquoted operand should evaluate to a string or a symbol.</p>
+<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r">mean_nm <-<span class="st"> "mean"</span>
+count_nm <-<span class="st"> "count"</span>
+
+mtcars <span class="op">%>%</span>
+<span class="st"> </span><span class="kw">group_by</span>(am) <span class="op">%>%</span>
+<span class="st"> </span><span class="kw">summarise</span>(
+ <span class="op">!!</span>mean_nm <span class="op">:</span><span class="er">=</span><span class="st"> </span><span class="kw">mean</span>(cyl),
+ <span class="op">!!</span>count_nm <span class="op">:</span><span class="er">=</span><span class="st"> </span><span class="kw">n</span>()
+ )
+<span class="co">#> # A tibble: 2 x 3</span>
+<span class="co">#> am mean count</span>
+<span class="co">#> <dbl> <dbl> <int></span>
+<span class="co">#> 1 0 6.95 19</span>
+<span class="co">#> 2 1.00 5.08 13</span></code></pre></div>
+</div>
+</div>
+
+
+
+<!-- dynamically load mathjax for compatibility with self-contained -->
+<script>
+ (function () {
+ var script = document.createElement("script");
+ script.type = "text/javascript";
+ script.src = "https://mathjax.rstudio.com/latest/MathJax.js?config=TeX-AMS-MML_HTMLorMML";
+ document.getElementsByTagName("head")[0].appendChild(script);
+ })();
+</script>
+
+</body>
+</html>
diff --git a/inst/doc/two-table.Rmd b/inst/doc/two-table.Rmd
index 7e5bae7..73d88a6 100644
--- a/inst/doc/two-table.Rmd
+++ b/inst/doc/two-table.Rmd
@@ -1,6 +1,5 @@
---
title: "Two-table verbs"
-date: "`r Sys.Date()`"
output: rmarkdown::html_vignette
vignette: >
%\VignetteIndexEntry{Two-table verbs}
@@ -186,25 +185,6 @@ setdiff(df1, df2)
setdiff(df2, df1)
```
-## Databases
-
-Each two-table verb has a straightforward SQL equivalent:
-
-| R | SQL
-|------------------|--------
-| `inner_join()` | `SELECT * FROM x JOIN y ON x.a = y.a`
-| `left_join()` | `SELECT * FROM x LEFT JOIN y ON x.a = y.a`
-| `right_join()` | `SELECT * FROM x RIGHT JOIN y ON x.a = y.a`
-| `full_join()` | `SELECT * FROM x FULL JOIN y ON x.a = y.a`
-| `semi_join()` | `SELECT * FROM x WHERE EXISTS (SELECT 1 FROM y WHERE x.a = y.a)`
-| `anti_join()` | `SELECT * FROM x WHERE NOT EXISTS (SELECT 1 FROM y WHERE x.a = y.a)`
-| `intersect(x, y)`| `SELECT * FROM x INTERSECT SELECT * FROM y`
-| `union(x, y)` | `SELECT * FROM x UNION SELECT * FROM y`
-| `setdiff(x, y)` | `SELECT * FROM x EXCEPT SELECT * FROM y`
-
-`x` and `y` don't have to be tables in the same database. If you specify `copy = TRUE`, dplyr will copy the `y` table into the same location as the `x` variable. This is useful if you've downloaded a summarised dataset and determined a subset of interest that you now want the full data for. You can use `semi_join(x, y, copy = TRUE)` to upload the indices of interest to a temporary table in the same database as `x`, and then perform a efficient semi join in the database.
-
-If you're working with large data, it maybe also be helpful to set `auto_index = TRUE`. That will automatically add an index on the join variables to the temporary table.
## Coercion rules
@@ -257,4 +237,4 @@ full_join(df1, df2) %>% str()
## Multiple-table verbs
-dplyr does not provide any functions for working with three or more tables. Instead use `Reduce()`, as described in [Advanced R](http://adv-r.had.co.nz/Functionals.html#functionals-fp), to iteratively combine the two-table verbs to handle as many tables as you need.
+dplyr does not provide any functions for working with three or more tables. Instead use `purrr::reduce()` or `Reduce()`, as described in [Advanced R](http://adv-r.had.co.nz/Functionals.html#functionals-fp), to iteratively combine the two-table verbs to handle as many tables as you need.
diff --git a/inst/doc/two-table.html b/inst/doc/two-table.html
index 4af1430..9c71b9e 100644
--- a/inst/doc/two-table.html
+++ b/inst/doc/two-table.html
@@ -4,14 +4,13 @@
<head>
-<meta charset="utf-8">
+<meta charset="utf-8" />
<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
<meta name="generator" content="pandoc" />
<meta name="viewport" content="width=device-width, initial-scale=1">
-<meta name="date" content="2016-06-23" />
<title>Two-table verbs</title>
@@ -68,7 +67,6 @@ code > span.in { color: #60a0b0; font-weight: bold; font-style: italic; } /* Inf
<h1 class="title toc-ignore">Two-table verbs</h1>
-<h4 class="date"><em>2016-06-23</em></h4>
@@ -85,96 +83,95 @@ code > span.in { color: #60a0b0; font-weight: bold; font-style: italic; } /* Inf
<p>Mutating joins allow you to combine variables from multiple tables. For example, take the nycflights13 data. In one table we have flight information with an abbreviation for carrier, and in another we have a mapping between abbreviations and full names. You can use a join to add the carrier names to the flight data:</p>
<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"><span class="kw">library</span>(<span class="st">"nycflights13"</span>)
<span class="co"># Drop unimportant variables so it's easier to understand the join results.</span>
-flights2 <-<span class="st"> </span>flights %>%<span class="st"> </span><span class="kw">select</span>(year:day, hour, origin, dest, tailnum, carrier)
+flights2 <-<span class="st"> </span>flights <span class="op">%>%</span><span class="st"> </span><span class="kw">select</span>(year<span class="op">:</span>day, hour, origin, dest, tailnum, carrier)
-flights2 %>%<span class="st"> </span>
+flights2 <span class="op">%>%</span><span class="st"> </span>
<span class="st"> </span><span class="kw">left_join</span>(airlines)
<span class="co">#> Joining, by = "carrier"</span>
<span class="co">#> # A tibble: 336,776 x 9</span>
-<span class="co">#> year month day hour origin dest tailnum carrier</span>
-<span class="co">#> <int> <int> <int> <dbl> <chr> <chr> <chr> <chr></span>
-<span class="co">#> 1 2013 1 1 5 EWR IAH N14228 UA</span>
-<span class="co">#> 2 2013 1 1 5 LGA IAH N24211 UA</span>
-<span class="co">#> 3 2013 1 1 5 JFK MIA N619AA AA</span>
-<span class="co">#> 4 2013 1 1 5 JFK BQN N804JB B6</span>
-<span class="co">#> 5 2013 1 1 6 LGA ATL N668DN DL</span>
-<span class="co">#> ... with 3.368e+05 more rows, and 1 more variables: name <chr></span></code></pre></div>
+<span class="co">#> year month day hour origin dest tailnum carrier name </span>
+<span class="co">#> <int> <int> <int> <dbl> <chr> <chr> <chr> <chr> <chr> </span>
+<span class="co">#> 1 2013 1 1 5.00 EWR IAH N14228 UA "United Air Lines …</span>
+<span class="co">#> 2 2013 1 1 5.00 LGA IAH N24211 UA "United Air Lines …</span>
+<span class="co">#> 3 2013 1 1 5.00 JFK MIA N619AA AA "American Airlines…</span>
+<span class="co">#> 4 2013 1 1 5.00 JFK BQN N804JB B6 "JetBlue Airways" </span>
+<span class="co">#> 5 2013 1 1 6.00 LGA ATL N668DN DL "Delta Air Lines I…</span>
+<span class="co">#> # ... with 3.368e+05 more rows</span></code></pre></div>
<div id="controlling-how-the-tables-are-matched" class="section level3">
<h3>Controlling how the tables are matched</h3>
<p>As well as <code>x</code> and <code>y</code>, each mutating join takes an argument <code>by</code> that controls which variables are used to match observations in the two tables. There are a few ways to specify it, as I illustrate below with various tables from nycflights13:</p>
<ul>
<li><p><code>NULL</code>, the default. dplyr will use all variables that appear in both tables, a <strong>natural</strong> join. For example, the flights and weather tables match on their common variables: year, month, day, hour and origin.</p>
-<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r">flights2 %>%<span class="st"> </span><span class="kw">left_join</span>(weather)
+<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r">flights2 <span class="op">%>%</span><span class="st"> </span><span class="kw">left_join</span>(weather)
<span class="co">#> Joining, by = c("year", "month", "day", "hour", "origin")</span>
<span class="co">#> # A tibble: 336,776 x 18</span>
-<span class="co">#> year month day hour origin dest tailnum carrier temp dewp humid</span>
-<span class="co">#> <dbl> <dbl> <int> <dbl> <chr> <chr> <chr> <chr> <dbl> <dbl> <dbl></span>
-<span class="co">#> 1 2013 1 1 5 EWR IAH N14228 UA NA NA NA</span>
-<span class="co">#> 2 2013 1 1 5 LGA IAH N24211 UA NA NA NA</span>
-<span class="co">#> 3 2013 1 1 5 JFK MIA N619AA AA NA NA NA</span>
-<span class="co">#> 4 2013 1 1 5 JFK BQN N804JB B6 NA NA NA</span>
-<span class="co">#> 5 2013 1 1 6 LGA ATL N668DN DL 39.92 26.06 57.33</span>
-<span class="co">#> ... with 3.368e+05 more rows, and 7 more variables: wind_dir <dbl>,</span>
-<span class="co">#> wind_speed <dbl>, wind_gust <dbl>, precip <dbl>, pressure <dbl>,</span>
-<span class="co">#> visib <dbl>, time_hour <time></span></code></pre></div></li>
+<span class="co">#> year month day hour origin dest tail… carr… temp dewp humid wind…</span>
+<span class="co">#> <dbl> <dbl> <int> <dbl> <chr> <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl></span>
+<span class="co">#> 1 2013 1.00 1 5.00 EWR IAH N142… UA NA NA NA NA</span>
+<span class="co">#> 2 2013 1.00 1 5.00 LGA IAH N242… UA NA NA NA NA</span>
+<span class="co">#> 3 2013 1.00 1 5.00 JFK MIA N619… AA NA NA NA NA</span>
+<span class="co">#> 4 2013 1.00 1 5.00 JFK BQN N804… B6 NA NA NA NA</span>
+<span class="co">#> 5 2013 1.00 1 6.00 LGA ATL N668… DL 39.9 26.1 57.3 260</span>
+<span class="co">#> # ... with 3.368e+05 more rows, and 6 more variables: wind_speed <dbl>,</span>
+<span class="co">#> # wind_gust <dbl>, precip <dbl>, pressure <dbl>, visib <dbl>,</span>
+<span class="co">#> # time_hour <dttm></span></code></pre></div></li>
<li><p>A character vector, <code>by = "x"</code>. Like a natural join, but uses only some of the common variables. For example, <code>flights</code> and <code>planes</code> have <code>year</code> columns, but they mean different things so we only want to join by <code>tailnum</code>.</p>
-<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r">flights2 %>%<span class="st"> </span><span class="kw">left_join</span>(planes, <span class="dt">by =</span> <span class="st">"tailnum"</span>)
+<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r">flights2 <span class="op">%>%</span><span class="st"> </span><span class="kw">left_join</span>(planes, <span class="dt">by =</span> <span class="st">"tailnum"</span>)
<span class="co">#> # A tibble: 336,776 x 16</span>
-<span class="co">#> year.x month day hour origin dest tailnum carrier year.y</span>
-<span class="co">#> <int> <int> <int> <dbl> <chr> <chr> <chr> <chr> <int></span>
-<span class="co">#> 1 2013 1 1 5 EWR IAH N14228 UA 1999</span>
-<span class="co">#> 2 2013 1 1 5 LGA IAH N24211 UA 1998</span>
-<span class="co">#> 3 2013 1 1 5 JFK MIA N619AA AA 1990</span>
-<span class="co">#> 4 2013 1 1 5 JFK BQN N804JB B6 2012</span>
-<span class="co">#> 5 2013 1 1 6 LGA ATL N668DN DL 1991</span>
-<span class="co">#> ... with 3.368e+05 more rows, and 7 more variables: type <chr>,</span>
-<span class="co">#> manufacturer <chr>, model <chr>, engines <int>, seats <int>,</span>
-<span class="co">#> speed <int>, engine <chr></span></code></pre></div>
+<span class="co">#> year.x month day hour orig… dest tail… carr… year… type manu… model</span>
+<span class="co">#> <int> <int> <int> <dbl> <chr> <chr> <chr> <chr> <int> <chr> <chr> <chr></span>
+<span class="co">#> 1 2013 1 1 5.00 EWR IAH N142… UA 1999 "Fix… BOEI… 737-…</span>
+<span class="co">#> 2 2013 1 1 5.00 LGA IAH N242… UA 1998 "Fix… BOEI… 737-…</span>
+<span class="co">#> 3 2013 1 1 5.00 JFK MIA N619… AA 1990 "Fix… BOEI… 757-…</span>
+<span class="co">#> 4 2013 1 1 5.00 JFK BQN N804… B6 2012 "Fix… AIRB… A320…</span>
+<span class="co">#> 5 2013 1 1 6.00 LGA ATL N668… DL 1991 "Fix… BOEI… 757-…</span>
+<span class="co">#> # ... with 3.368e+05 more rows, and 4 more variables: engines <int>,</span>
+<span class="co">#> # seats <int>, speed <int>, engine <chr></span></code></pre></div>
<p>Note that the year columns in the output are disambiguated with a suffix.</p></li>
<li><p>A named character vector: <code>by = c("x" = "a")</code>. This will match variable <code>x</code> in table <code>x</code> to variable <code>a</code> in table <code>y</code>. The variables from <code>x</code> will be used in the output.</p>
<p>Each flight has an origin and destination <code>airport</code>, so we need to specify which one we want to join to:</p>
-<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r">flights2 %>%<span class="st"> </span><span class="kw">left_join</span>(airports, <span class="kw">c</span>(<span class="st">"dest"</span> =<span class="st"> "faa"</span>))
-<span class="co">#> # A tibble: 336,776 x 14</span>
-<span class="co">#> year month day hour origin dest tailnum carrier</span>
-<span class="co">#> <int> <int> <int> <dbl> <chr> <chr> <chr> <chr></span>
-<span class="co">#> 1 2013 1 1 5 EWR IAH N14228 UA</span>
-<span class="co">#> 2 2013 1 1 5 LGA IAH N24211 UA</span>
-<span class="co">#> 3 2013 1 1 5 JFK MIA N619AA AA</span>
-<span class="co">#> 4 2013 1 1 5 JFK BQN N804JB B6</span>
-<span class="co">#> 5 2013 1 1 6 LGA ATL N668DN DL</span>
-<span class="co">#> ... with 3.368e+05 more rows, and 6 more variables: name <chr>, lat <dbl>,</span>
-<span class="co">#> lon <dbl>, alt <int>, tz <dbl>, dst <chr></span>
-flights2 %>%<span class="st"> </span><span class="kw">left_join</span>(airports, <span class="kw">c</span>(<span class="st">"origin"</span> =<span class="st"> "faa"</span>))
-<span class="co">#> # A tibble: 336,776 x 14</span>
-<span class="co">#> year month day hour origin dest tailnum carrier name</span>
-<span class="co">#> <int> <int> <int> <dbl> <chr> <chr> <chr> <chr> <chr></span>
-<span class="co">#> 1 2013 1 1 5 EWR IAH N14228 UA Newark Liberty Intl</span>
-<span class="co">#> 2 2013 1 1 5 LGA IAH N24211 UA La Guardia</span>
-<span class="co">#> 3 2013 1 1 5 JFK MIA N619AA AA John F Kennedy Intl</span>
-<span class="co">#> 4 2013 1 1 5 JFK BQN N804JB B6 John F Kennedy Intl</span>
-<span class="co">#> 5 2013 1 1 6 LGA ATL N668DN DL La Guardia</span>
-<span class="co">#> ... with 3.368e+05 more rows, and 5 more variables: lat <dbl>, lon <dbl>,</span>
-<span class="co">#> alt <int>, tz <dbl>, dst <chr></span></code></pre></div></li>
+<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r">flights2 <span class="op">%>%</span><span class="st"> </span><span class="kw">left_join</span>(airports, <span class="kw">c</span>(<span class="st">"dest"</span> =<span class="st"> "faa"</span>))
+<span class="co">#> # A tibble: 336,776 x 15</span>
+<span class="co">#> year month day hour origin dest tail… carr… name lat lon alt</span>
+<span class="co">#> <int> <int> <int> <dbl> <chr> <chr> <chr> <chr> <chr> <dbl> <dbl> <int></span>
+<span class="co">#> 1 2013 1 1 5.00 EWR IAH N142… UA "Geo… 30.0 -95.3 97</span>
+<span class="co">#> 2 2013 1 1 5.00 LGA IAH N242… UA "Geo… 30.0 -95.3 97</span>
+<span class="co">#> 3 2013 1 1 5.00 JFK MIA N619… AA "Mia… 25.8 -80.3 8</span>
+<span class="co">#> 4 2013 1 1 5.00 JFK BQN N804… B6 <NA> NA NA NA</span>
+<span class="co">#> 5 2013 1 1 6.00 LGA ATL N668… DL "Har… 33.6 -84.4 1026</span>
+<span class="co">#> # ... with 3.368e+05 more rows, and 3 more variables: tz <dbl>, dst <chr>,</span>
+<span class="co">#> # tzone <chr></span>
+flights2 <span class="op">%>%</span><span class="st"> </span><span class="kw">left_join</span>(airports, <span class="kw">c</span>(<span class="st">"origin"</span> =<span class="st"> "faa"</span>))
+<span class="co">#> # A tibble: 336,776 x 15</span>
+<span class="co">#> year month day hour origin dest tail… carr… name lat lon alt</span>
+<span class="co">#> <int> <int> <int> <dbl> <chr> <chr> <chr> <chr> <chr> <dbl> <dbl> <int></span>
+<span class="co">#> 1 2013 1 1 5.00 EWR IAH N142… UA "New… 40.7 -74.2 18</span>
+<span class="co">#> 2 2013 1 1 5.00 LGA IAH N242… UA "La … 40.8 -73.9 22</span>
+<span class="co">#> 3 2013 1 1 5.00 JFK MIA N619… AA "Joh… 40.6 -73.8 13</span>
+<span class="co">#> 4 2013 1 1 5.00 JFK BQN N804… B6 "Joh… 40.6 -73.8 13</span>
+<span class="co">#> 5 2013 1 1 6.00 LGA ATL N668… DL "La … 40.8 -73.9 22</span>
+<span class="co">#> # ... with 3.368e+05 more rows, and 3 more variables: tz <dbl>, dst <chr>,</span>
+<span class="co">#> # tzone <chr></span></code></pre></div></li>
</ul>
</div>
<div id="types-of-join" class="section level3">
<h3>Types of join</h3>
<p>There are four types of mutating join, which differ in their behaviour when a match is not found. We’ll illustrate each with a simple example:</p>
-<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r">(df1 <-<span class="st"> </span><span class="kw">data_frame</span>(<span class="dt">x =</span> <span class="kw">c</span>(<span class="dv">1</span>, <span class="dv">2</span>), <span class="dt">y =</span> <span class="dv">2</span>:<span class="dv">1</span>))
+<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r">(df1 <-<span class="st"> </span><span class="kw">data_frame</span>(<span class="dt">x =</span> <span class="kw">c</span>(<span class="dv">1</span>, <span class="dv">2</span>), <span class="dt">y =</span> <span class="dv">2</span><span class="op">:</span><span class="dv">1</span>))
<span class="co">#> # A tibble: 2 x 2</span>
<span class="co">#> x y</span>
<span class="co">#> <dbl> <int></span>
-<span class="co">#> 1 1 2</span>
-<span class="co">#> 2 2 1</span>
+<span class="co">#> 1 1.00 2</span>
+<span class="co">#> 2 2.00 1</span>
(df2 <-<span class="st"> </span><span class="kw">data_frame</span>(<span class="dt">x =</span> <span class="kw">c</span>(<span class="dv">1</span>, <span class="dv">3</span>), <span class="dt">a =</span> <span class="dv">10</span>, <span class="dt">b =</span> <span class="st">"a"</span>))
<span class="co">#> # A tibble: 2 x 3</span>
-<span class="co">#> x a b</span>
+<span class="co">#> x a b </span>
<span class="co">#> <dbl> <dbl> <chr></span>
-<span class="co">#> 1 1 10 a</span>
-<span class="co">#> 2 3 10 a</span></code></pre></div>
+<span class="co">#> 1 1.00 10.0 a </span>
+<span class="co">#> 2 3.00 10.0 a</span></code></pre></div>
<ul>
<li><p><code>inner_join(x, y)</code> only includes observations that match in both <code>x</code> and <code>y</code>.</p>
-<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r">df1 %>%<span class="st"> </span><span class="kw">inner_join</span>(df2) %>%<span class="st"> </span>knitr::<span class="kw">kable</span>()
+<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r">df1 <span class="op">%>%</span><span class="st"> </span><span class="kw">inner_join</span>(df2) <span class="op">%>%</span><span class="st"> </span>knitr<span class="op">::</span><span class="kw">kable</span>()
<span class="co">#> Joining, by = "x"</span></code></pre></div>
<table>
<thead>
@@ -195,56 +192,56 @@ flights2 %>%<span class="st"> </span><span class="kw">left_join</span>(airpor
</tbody>
</table></li>
<li><p><code>left_join(x, y)</code> includes all observations in <code>x</code>, regardless of whether they match or not. This is the most commonly used join because it ensures that you don’t lose observations from your primary table.</p>
-<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r">df1 %>%<span class="st"> </span><span class="kw">left_join</span>(df2)
+<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r">df1 <span class="op">%>%</span><span class="st"> </span><span class="kw">left_join</span>(df2)
<span class="co">#> Joining, by = "x"</span>
<span class="co">#> # A tibble: 2 x 4</span>
-<span class="co">#> x y a b</span>
+<span class="co">#> x y a b </span>
<span class="co">#> <dbl> <int> <dbl> <chr></span>
-<span class="co">#> 1 1 2 10 a</span>
-<span class="co">#> 2 2 1 NA <NA></span></code></pre></div></li>
+<span class="co">#> 1 1.00 2 10.0 a </span>
+<span class="co">#> 2 2.00 1 NA <NA></span></code></pre></div></li>
<li><p><code>right_join(x, y)</code> includes all observations in <code>y</code>. It’s equivalent to <code>left_join(y, x)</code>, but the columns will be ordered differently.</p>
-<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r">df1 %>%<span class="st"> </span><span class="kw">right_join</span>(df2)
+<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r">df1 <span class="op">%>%</span><span class="st"> </span><span class="kw">right_join</span>(df2)
<span class="co">#> Joining, by = "x"</span>
<span class="co">#> # A tibble: 2 x 4</span>
-<span class="co">#> x y a b</span>
+<span class="co">#> x y a b </span>
<span class="co">#> <dbl> <int> <dbl> <chr></span>
-<span class="co">#> 1 1 2 10 a</span>
-<span class="co">#> 2 3 NA 10 a</span>
-df2 %>%<span class="st"> </span><span class="kw">left_join</span>(df1)
+<span class="co">#> 1 1.00 2 10.0 a </span>
+<span class="co">#> 2 3.00 NA 10.0 a</span>
+df2 <span class="op">%>%</span><span class="st"> </span><span class="kw">left_join</span>(df1)
<span class="co">#> Joining, by = "x"</span>
<span class="co">#> # A tibble: 2 x 4</span>
-<span class="co">#> x a b y</span>
+<span class="co">#> x a b y</span>
<span class="co">#> <dbl> <dbl> <chr> <int></span>
-<span class="co">#> 1 1 10 a 2</span>
-<span class="co">#> 2 3 10 a NA</span></code></pre></div></li>
+<span class="co">#> 1 1.00 10.0 a 2</span>
+<span class="co">#> 2 3.00 10.0 a NA</span></code></pre></div></li>
<li><p><code>full_join()</code> includes all observations from <code>x</code> and <code>y</code>.</p>
-<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r">df1 %>%<span class="st"> </span><span class="kw">full_join</span>(df2)
+<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r">df1 <span class="op">%>%</span><span class="st"> </span><span class="kw">full_join</span>(df2)
<span class="co">#> Joining, by = "x"</span>
<span class="co">#> # A tibble: 3 x 4</span>
-<span class="co">#> x y a b</span>
+<span class="co">#> x y a b </span>
<span class="co">#> <dbl> <int> <dbl> <chr></span>
-<span class="co">#> 1 1 2 10 a</span>
-<span class="co">#> 2 2 1 NA <NA></span>
-<span class="co">#> 3 3 NA 10 a</span></code></pre></div></li>
+<span class="co">#> 1 1.00 2 10.0 a </span>
+<span class="co">#> 2 2.00 1 NA <NA> </span>
+<span class="co">#> 3 3.00 NA 10.0 a</span></code></pre></div></li>
</ul>
<p>The left, right and full joins are collectively know as <strong>outer joins</strong>. When a row doesn’t match in an outer join, the new variables are filled in with missing values.</p>
</div>
<div id="observations" class="section level3">
<h3>Observations</h3>
<p>While mutating joins are primarily used to add new variables, they can also generate new observations. If a match is not unique, a join will add all possible combinations (the Cartesian product) of the matching observations:</p>
-<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r">df1 <-<span class="st"> </span><span class="kw">data_frame</span>(<span class="dt">x =</span> <span class="kw">c</span>(<span class="dv">1</span>, <span class="dv">1</span>, <span class="dv">2</span>), <span class="dt">y =</span> <span class="dv">1</span>:<span class="dv">3</span>)
+<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r">df1 <-<span class="st"> </span><span class="kw">data_frame</span>(<span class="dt">x =</span> <span class="kw">c</span>(<span class="dv">1</span>, <span class="dv">1</span>, <span class="dv">2</span>), <span class="dt">y =</span> <span class="dv">1</span><span class="op">:</span><span class="dv">3</span>)
df2 <-<span class="st"> </span><span class="kw">data_frame</span>(<span class="dt">x =</span> <span class="kw">c</span>(<span class="dv">1</span>, <span class="dv">1</span>, <span class="dv">2</span>), <span class="dt">z =</span> <span class="kw">c</span>(<span class="st">"a"</span>, <span class="st">"b"</span>, <span class="st">"a"</span>))
-df1 %>%<span class="st"> </span><span class="kw">left_join</span>(df2)
+df1 <span class="op">%>%</span><span class="st"> </span><span class="kw">left_join</span>(df2)
<span class="co">#> Joining, by = "x"</span>
<span class="co">#> # A tibble: 5 x 3</span>
-<span class="co">#> x y z</span>
+<span class="co">#> x y z </span>
<span class="co">#> <dbl> <int> <chr></span>
-<span class="co">#> 1 1 1 a</span>
-<span class="co">#> 2 1 1 b</span>
-<span class="co">#> 3 1 2 a</span>
-<span class="co">#> 4 1 2 b</span>
-<span class="co">#> 5 2 3 a</span></code></pre></div>
+<span class="co">#> 1 1.00 1 a </span>
+<span class="co">#> 2 1.00 1 b </span>
+<span class="co">#> 3 1.00 2 a </span>
+<span class="co">#> 4 1.00 2 b </span>
+<span class="co">#> 5 2.00 3 a</span></code></pre></div>
</div>
</div>
<div id="filtering-joins" class="section level2">
@@ -256,30 +253,30 @@ df1 %>%<span class="st"> </span><span class="kw">left_join</span>(df2)
</ul>
<p>These are most useful for diagnosing join mismatches. For example, there are many flights in the nycflights13 dataset that don’t have a matching tail number in the planes table:</p>
<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"><span class="kw">library</span>(<span class="st">"nycflights13"</span>)
-flights %>%<span class="st"> </span>
-<span class="st"> </span><span class="kw">anti_join</span>(planes, <span class="dt">by =</span> <span class="st">"tailnum"</span>) %>%<span class="st"> </span>
+flights <span class="op">%>%</span><span class="st"> </span>
+<span class="st"> </span><span class="kw">anti_join</span>(planes, <span class="dt">by =</span> <span class="st">"tailnum"</span>) <span class="op">%>%</span><span class="st"> </span>
<span class="st"> </span><span class="kw">count</span>(tailnum, <span class="dt">sort =</span> <span class="ot">TRUE</span>)
<span class="co">#> # A tibble: 722 x 2</span>
<span class="co">#> tailnum n</span>
-<span class="co">#> <chr> <int></span>
-<span class="co">#> 1 <NA> 2512</span>
-<span class="co">#> 2 N725MQ 575</span>
-<span class="co">#> 3 N722MQ 513</span>
-<span class="co">#> 4 N723MQ 507</span>
-<span class="co">#> 5 N713MQ 483</span>
-<span class="co">#> ... with 717 more rows</span></code></pre></div>
+<span class="co">#> <chr> <int></span>
+<span class="co">#> 1 <NA> 2512</span>
+<span class="co">#> 2 N725MQ 575</span>
+<span class="co">#> 3 N722MQ 513</span>
+<span class="co">#> 4 N723MQ 507</span>
+<span class="co">#> 5 N713MQ 483</span>
+<span class="co">#> # ... with 717 more rows</span></code></pre></div>
<p>If you’re worried about what observations your joins will match, start with a <code>semi_join()</code> or <code>anti_join()</code>. <code>semi_join()</code> and <code>anti_join()</code> never duplicate; they only ever remove observations.</p>
-<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r">df1 <-<span class="st"> </span><span class="kw">data_frame</span>(<span class="dt">x =</span> <span class="kw">c</span>(<span class="dv">1</span>, <span class="dv">1</span>, <span class="dv">3</span>, <span class="dv">4</span>), <span class="dt">y =</span> <span class="dv">1</span>:<span class="dv">4</span>)
+<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r">df1 <-<span class="st"> </span><span class="kw">data_frame</span>(<span class="dt">x =</span> <span class="kw">c</span>(<span class="dv">1</span>, <span class="dv">1</span>, <span class="dv">3</span>, <span class="dv">4</span>), <span class="dt">y =</span> <span class="dv">1</span><span class="op">:</span><span class="dv">4</span>)
df2 <-<span class="st"> </span><span class="kw">data_frame</span>(<span class="dt">x =</span> <span class="kw">c</span>(<span class="dv">1</span>, <span class="dv">1</span>, <span class="dv">2</span>), <span class="dt">z =</span> <span class="kw">c</span>(<span class="st">"a"</span>, <span class="st">"b"</span>, <span class="st">"a"</span>))
<span class="co"># Four rows to start with:</span>
-df1 %>%<span class="st"> </span><span class="kw">nrow</span>()
+df1 <span class="op">%>%</span><span class="st"> </span><span class="kw">nrow</span>()
<span class="co">#> [1] 4</span>
<span class="co"># And we get four rows after the join</span>
-df1 %>%<span class="st"> </span><span class="kw">inner_join</span>(df2, <span class="dt">by =</span> <span class="st">"x"</span>) %>%<span class="st"> </span><span class="kw">nrow</span>()
+df1 <span class="op">%>%</span><span class="st"> </span><span class="kw">inner_join</span>(df2, <span class="dt">by =</span> <span class="st">"x"</span>) <span class="op">%>%</span><span class="st"> </span><span class="kw">nrow</span>()
<span class="co">#> [1] 4</span>
<span class="co"># But only two rows actually match</span>
-df1 %>%<span class="st"> </span><span class="kw">semi_join</span>(df2, <span class="dt">by =</span> <span class="st">"x"</span>) %>%<span class="st"> </span><span class="kw">nrow</span>()
+df1 <span class="op">%>%</span><span class="st"> </span><span class="kw">semi_join</span>(df2, <span class="dt">by =</span> <span class="st">"x"</span>) <span class="op">%>%</span><span class="st"> </span><span class="kw">nrow</span>()
<span class="co">#> [1] 2</span></code></pre></div>
</div>
<div id="set-operations" class="section level2">
@@ -291,13 +288,13 @@ df1 %>%<span class="st"> </span><span class="kw">semi_join</span>(df2, <span
<li><code>setdiff(x, y)</code>: return observations in <code>x</code>, but not in <code>y</code>.</li>
</ul>
<p>Given this simple data:</p>
-<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r">(df1 <-<span class="st"> </span><span class="kw">data_frame</span>(<span class="dt">x =</span> <span class="dv">1</span>:<span class="dv">2</span>, <span class="dt">y =</span> <span class="kw">c</span>(1L, 1L)))
+<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r">(df1 <-<span class="st"> </span><span class="kw">data_frame</span>(<span class="dt">x =</span> <span class="dv">1</span><span class="op">:</span><span class="dv">2</span>, <span class="dt">y =</span> <span class="kw">c</span>(1L, 1L)))
<span class="co">#> # A tibble: 2 x 2</span>
<span class="co">#> x y</span>
<span class="co">#> <int> <int></span>
<span class="co">#> 1 1 1</span>
<span class="co">#> 2 2 1</span>
-(df2 <-<span class="st"> </span><span class="kw">data_frame</span>(<span class="dt">x =</span> <span class="dv">1</span>:<span class="dv">2</span>, <span class="dt">y =</span> <span class="dv">1</span>:<span class="dv">2</span>))
+(df2 <-<span class="st"> </span><span class="kw">data_frame</span>(<span class="dt">x =</span> <span class="dv">1</span><span class="op">:</span><span class="dv">2</span>, <span class="dt">y =</span> <span class="dv">1</span><span class="op">:</span><span class="dv">2</span>))
<span class="co">#> # A tibble: 2 x 2</span>
<span class="co">#> x y</span>
<span class="co">#> <int> <int></span>
@@ -328,62 +325,6 @@ df1 %>%<span class="st"> </span><span class="kw">semi_join</span>(df2, <span
<span class="co">#> <int> <int></span>
<span class="co">#> 1 2 2</span></code></pre></div>
</div>
-<div id="databases" class="section level2">
-<h2>Databases</h2>
-<p>Each two-table verb has a straightforward SQL equivalent:</p>
-<table style="width:39%;">
-<colgroup>
-<col width="26%"></col>
-<col width="12%"></col>
-</colgroup>
-<thead>
-<tr class="header">
-<th>R</th>
-<th>SQL</th>
-</tr>
-</thead>
-<tbody>
-<tr class="odd">
-<td><code>inner_join()</code></td>
-<td><code>SELECT * FROM x JOIN y ON x.a = y.a</code></td>
-</tr>
-<tr class="even">
-<td><code>left_join()</code></td>
-<td><code>SELECT * FROM x LEFT JOIN y ON x.a = y.a</code></td>
-</tr>
-<tr class="odd">
-<td><code>right_join()</code></td>
-<td><code>SELECT * FROM x RIGHT JOIN y ON x.a = y.a</code></td>
-</tr>
-<tr class="even">
-<td><code>full_join()</code></td>
-<td><code>SELECT * FROM x FULL JOIN y ON x.a = y.a</code></td>
-</tr>
-<tr class="odd">
-<td><code>semi_join()</code></td>
-<td><code>SELECT * FROM x WHERE EXISTS (SELECT 1 FROM y WHERE x.a = y.a)</code></td>
-</tr>
-<tr class="even">
-<td><code>anti_join()</code></td>
-<td><code>SELECT * FROM x WHERE NOT EXISTS (SELECT 1 FROM y WHERE x.a = y.a)</code></td>
-</tr>
-<tr class="odd">
-<td><code>intersect(x, y)</code></td>
-<td><code>SELECT * FROM x INTERSECT SELECT * FROM y</code></td>
-</tr>
-<tr class="even">
-<td><code>union(x, y)</code></td>
-<td><code>SELECT * FROM x UNION SELECT * FROM y</code></td>
-</tr>
-<tr class="odd">
-<td><code>setdiff(x, y)</code></td>
-<td><code>SELECT * FROM x EXCEPT SELECT * FROM y</code></td>
-</tr>
-</tbody>
-</table>
-<p><code>x</code> and <code>y</code> don’t have to be tables in the same database. If you specify <code>copy = TRUE</code>, dplyr will copy the <code>y</code> table into the same location as the <code>x</code> variable. This is useful if you’ve downloaded a summarised dataset and determined a subset of interest that you now want the full data for. You can use <code>semi_join(x, y, copy = TRUE)</code> to upload the indices of interest to a temporary table in the same database as <code>x</ [...]
-<p>If you’re working with large data, it maybe also be helpful to set <code>auto_index = TRUE</code>. That will automatically add an index on the join variables to the temporary table.</p>
-</div>
<div id="coercion-rules" class="section level2">
<h2>Coercion rules</h2>
<p>When joining tables, dplyr is a little more conservative than base R about the types of variable that it considers equivalent. This is mostly likely to surprise if you’re working factors:</p>
@@ -391,27 +332,27 @@ df1 %>%<span class="st"> </span><span class="kw">semi_join</span>(df2, <span
<li><p>Factors with different levels are coerced to character with a warning:</p>
<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r">df1 <-<span class="st"> </span><span class="kw">data_frame</span>(<span class="dt">x =</span> <span class="dv">1</span>, <span class="dt">y =</span> <span class="kw">factor</span>(<span class="st">"a"</span>))
df2 <-<span class="st"> </span><span class="kw">data_frame</span>(<span class="dt">x =</span> <span class="dv">2</span>, <span class="dt">y =</span> <span class="kw">factor</span>(<span class="st">"b"</span>))
-<span class="kw">full_join</span>(df1, df2) %>%<span class="st"> </span><span class="kw">str</span>()
+<span class="kw">full_join</span>(df1, df2) <span class="op">%>%</span><span class="st"> </span><span class="kw">str</span>()
<span class="co">#> Joining, by = c("x", "y")</span>
-<span class="co">#> Warning in full_join_impl(x, y, by$x, by$y, suffix$x, suffix$y): joining</span>
-<span class="co">#> factors with different levels, coercing to character vector</span>
+<span class="co">#> Warning: Column `y` joining factors with different levels, coercing to</span>
+<span class="co">#> character vector</span>
<span class="co">#> Classes 'tbl_df', 'tbl' and 'data.frame': 2 obs. of 2 variables:</span>
<span class="co">#> $ x: num 1 2</span>
<span class="co">#> $ y: chr "a" "b"</span></code></pre></div></li>
<li><p>Factors with the same levels in a different order are coerced to character with a warning:</p>
<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r">df1 <-<span class="st"> </span><span class="kw">data_frame</span>(<span class="dt">x =</span> <span class="dv">1</span>, <span class="dt">y =</span> <span class="kw">factor</span>(<span class="st">"a"</span>, <span class="dt">levels =</span> <span class="kw">c</span>(<span class="st">"a"</span>, <span class="st">"b"</span>)))
df2 <-<span class="st"> </span><span class="kw">data_frame</span>(<span class="dt">x =</span> <span class="dv">2</span>, <span class="dt">y =</span> <span class="kw">factor</span>(<span class="st">"b"</span>, <span class="dt">levels =</span> <span class="kw">c</span>(<span class="st">"b"</span>, <span class="st">"a"</span>)))
-<span class="kw">full_join</span>(df1, df2) %>%<span class="st"> </span><span class="kw">str</span>()
+<span class="kw">full_join</span>(df1, df2) <span class="op">%>%</span><span class="st"> </span><span class="kw">str</span>()
<span class="co">#> Joining, by = c("x", "y")</span>
-<span class="co">#> Warning in full_join_impl(x, y, by$x, by$y, suffix$x, suffix$y): joining</span>
-<span class="co">#> factors with different levels, coercing to character vector</span>
+<span class="co">#> Warning: Column `y` joining factors with different levels, coercing to</span>
+<span class="co">#> character vector</span>
<span class="co">#> Classes 'tbl_df', 'tbl' and 'data.frame': 2 obs. of 2 variables:</span>
<span class="co">#> $ x: num 1 2</span>
<span class="co">#> $ y: chr "a" "b"</span></code></pre></div></li>
<li><p>Factors are preserved only if the levels match exactly:</p>
<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r">df1 <-<span class="st"> </span><span class="kw">data_frame</span>(<span class="dt">x =</span> <span class="dv">1</span>, <span class="dt">y =</span> <span class="kw">factor</span>(<span class="st">"a"</span>, <span class="dt">levels =</span> <span class="kw">c</span>(<span class="st">"a"</span>, <span class="st">"b"</span>)))
df2 <-<span class="st"> </span><span class="kw">data_frame</span>(<span class="dt">x =</span> <span class="dv">2</span>, <span class="dt">y =</span> <span class="kw">factor</span>(<span class="st">"b"</span>, <span class="dt">levels =</span> <span class="kw">c</span>(<span class="st">"a"</span>, <span class="st">"b"</span>)))
-<span class="kw">full_join</span>(df1, df2) %>%<span class="st"> </span><span class="kw">str</span>()
+<span class="kw">full_join</span>(df1, df2) <span class="op">%>%</span><span class="st"> </span><span class="kw">str</span>()
<span class="co">#> Joining, by = c("x", "y")</span>
<span class="co">#> Classes 'tbl_df', 'tbl' and 'data.frame': 2 obs. of 2 variables:</span>
<span class="co">#> $ x: num 1 2</span>
@@ -419,10 +360,10 @@ df2 <-<span class="st"> </span><span class="kw">data_frame</span>(<span class
<li><p>A factor and a character are coerced to character with a warning:</p>
<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r">df1 <-<span class="st"> </span><span class="kw">data_frame</span>(<span class="dt">x =</span> <span class="dv">1</span>, <span class="dt">y =</span> <span class="st">"a"</span>)
df2 <-<span class="st"> </span><span class="kw">data_frame</span>(<span class="dt">x =</span> <span class="dv">2</span>, <span class="dt">y =</span> <span class="kw">factor</span>(<span class="st">"a"</span>))
-<span class="kw">full_join</span>(df1, df2) %>%<span class="st"> </span><span class="kw">str</span>()
+<span class="kw">full_join</span>(df1, df2) <span class="op">%>%</span><span class="st"> </span><span class="kw">str</span>()
<span class="co">#> Joining, by = c("x", "y")</span>
-<span class="co">#> Warning in full_join_impl(x, y, by$x, by$y, suffix$x, suffix$y): joining</span>
-<span class="co">#> factor and character vector, coercing into character vector</span>
+<span class="co">#> Warning: Column `y` joining character vector and factor, coercing into</span>
+<span class="co">#> character vector</span>
<span class="co">#> Classes 'tbl_df', 'tbl' and 'data.frame': 2 obs. of 2 variables:</span>
<span class="co">#> $ x: num 1 2</span>
<span class="co">#> $ y: chr "a" "a"</span></code></pre></div></li>
@@ -430,7 +371,7 @@ df2 <-<span class="st"> </span><span class="kw">data_frame</span>(<span class
<p>Otherwise logicals will be silently upcast to integer, and integer to numeric, but coercing to character will raise an error:</p>
<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r">df1 <-<span class="st"> </span><span class="kw">data_frame</span>(<span class="dt">x =</span> <span class="dv">1</span>, <span class="dt">y =</span> 1L)
df2 <-<span class="st"> </span><span class="kw">data_frame</span>(<span class="dt">x =</span> <span class="dv">2</span>, <span class="dt">y =</span> <span class="fl">1.5</span>)
-<span class="kw">full_join</span>(df1, df2) %>%<span class="st"> </span><span class="kw">str</span>()
+<span class="kw">full_join</span>(df1, df2) <span class="op">%>%</span><span class="st"> </span><span class="kw">str</span>()
<span class="co">#> Joining, by = c("x", "y")</span>
<span class="co">#> Classes 'tbl_df', 'tbl' and 'data.frame': 2 obs. of 2 variables:</span>
<span class="co">#> $ x: num 1 2</span>
@@ -438,13 +379,13 @@ df2 <-<span class="st"> </span><span class="kw">data_frame</span>(<span class
df1 <-<span class="st"> </span><span class="kw">data_frame</span>(<span class="dt">x =</span> <span class="dv">1</span>, <span class="dt">y =</span> 1L)
df2 <-<span class="st"> </span><span class="kw">data_frame</span>(<span class="dt">x =</span> <span class="dv">2</span>, <span class="dt">y =</span> <span class="st">"a"</span>)
-<span class="kw">full_join</span>(df1, df2) %>%<span class="st"> </span><span class="kw">str</span>()
+<span class="kw">full_join</span>(df1, df2) <span class="op">%>%</span><span class="st"> </span><span class="kw">str</span>()
<span class="co">#> Joining, by = c("x", "y")</span>
-<span class="co">#> Error in eval(expr, envir, enclos): Can't join on 'y' x 'y' because of incompatible types (character / integer)</span></code></pre></div>
+<span class="co">#> Error in full_join_impl(x, y, by$x, by$y, suffix$x, suffix$y, check_na_matches(na_matches)): Can't join on 'y' x 'y' because of incompatible types (character / integer)</span></code></pre></div>
</div>
<div id="multiple-table-verbs" class="section level2">
<h2>Multiple-table verbs</h2>
-<p>dplyr does not provide any functions for working with three or more tables. Instead use <code>Reduce()</code>, as described in <a href="http://adv-r.had.co.nz/Functionals.html#functionals-fp">Advanced R</a>, to iteratively combine the two-table verbs to handle as many tables as you need.</p>
+<p>dplyr does not provide any functions for working with three or more tables. Instead use <code>purrr::reduce()</code> or <code>Reduce()</code>, as described in <a href="http://adv-r.had.co.nz/Functionals.html#functionals-fp">Advanced R</a>, to iteratively combine the two-table verbs to handle as many tables as you need.</p>
</div>
@@ -454,7 +395,7 @@ df2 <-<span class="st"> </span><span class="kw">data_frame</span>(<span class
(function () {
var script = document.createElement("script");
script.type = "text/javascript";
- script.src = "https://cdn.mathjax.org/mathjax/latest/MathJax.js?config=TeX-AMS-MML_HTMLorMML";
+ script.src = "https://mathjax.rstudio.com/latest/MathJax.js?config=TeX-AMS-MML_HTMLorMML";
document.getElementsByTagName("head")[0].appendChild(script);
})();
</script>
diff --git a/inst/doc/window-functions.R b/inst/doc/window-functions.R
index 015adce..1f27b76 100644
--- a/inst/doc/window-functions.R
+++ b/inst/doc/window-functions.R
@@ -1,28 +1,35 @@
-## ---- echo = FALSE, message = FALSE--------------------------------------
+## ---- include = FALSE----------------------------------------------------
knitr::opts_chunk$set(collapse = T, comment = "#>")
options(tibble.print_min = 4L, tibble.print_max = 4L)
library(dplyr)
+set.seed(1014)
-## ---- results = "hide"---------------------------------------------------
+## ------------------------------------------------------------------------
library(Lahman)
-batting <- select(tbl_df(Batting), playerID, yearID, teamID, G, AB:H)
-batting <- arrange(batting, playerID, yearID, teamID)
-players <- group_by(batting, playerID)
-
-# For each player, find the two years with most hits
-filter(players, min_rank(desc(H)) <= 2 & H > 0)
-# Within each player, rank each year by the number of games played
-mutate(players, G_rank = min_rank(G))
-
-# For each player, find every year that was better than the previous year
-filter(players, G > lag(G))
-# For each player, compute avg change in games played per year
-mutate(players, G_change = (G - lag(G)) / (yearID - lag(yearID)))
-
-# For each player, find all where they played more games than average
-filter(players, G > mean(G))
-# For each, player compute a z score based on number of games played
-mutate(players, G_z = (G - mean(G)) / sd(G))
+
+batting <- Lahman::Batting %>%
+ as_tibble() %>%
+ select(playerID, yearID, teamID, G, AB:H) %>%
+ arrange(playerID, yearID, teamID) %>%
+ semi_join(Lahman::AwardsPlayers, by = "playerID")
+
+players <- batting %>% group_by(playerID)
+
+## ---- eval = FALSE-------------------------------------------------------
+# # For each player, find the two years with most hits
+# filter(players, min_rank(desc(H)) <= 2 & H > 0)
+# # Within each player, rank each year by the number of games played
+# mutate(players, G_rank = min_rank(G))
+#
+# # For each player, find every year that was better than the previous year
+# filter(players, G > lag(G))
+# # For each player, compute avg change in games played per year
+# mutate(players, G_change = (G - lag(G)) / (yearID - lag(yearID)))
+#
+# # For each player, find all where they played more games than average
+# filter(players, G > mean(G))
+# # For each, player compute a z score based on number of games played
+# mutate(players, G_z = (G - mean(G)) / sd(G))
## ------------------------------------------------------------------------
x <- c(1, 1, 2, 2, 2)
@@ -35,11 +42,7 @@ dense_rank(x)
cume_dist(x)
percent_rank(x)
-## ---- results = 'hide'---------------------------------------------------
-# Selects best two years
-filter(players, min_rank(desc(G)) < 2)
-
-# Selects best 10% of years
+## ------------------------------------------------------------------------
filter(players, cume_dist(desc(G)) < 0.1)
## ------------------------------------------------------------------------
@@ -71,20 +74,20 @@ arrange(wrong, year)
right <- mutate(scrambled, running = order_by(year, cumsum(value)))
arrange(right, year)
-## ---- results = "hide"---------------------------------------------------
-filter(players, cumany(G > 150))
+## ---- eval = FALSE-------------------------------------------------------
+# filter(players, cumany(G > 150))
## ------------------------------------------------------------------------
x <- 1:10
y <- 10:1
order_by(y, cumsum(x))
-## ---- results = "hide"---------------------------------------------------
-filter(players, G > mean(G))
-filter(players, G < median(G))
+## ---- eval = FALSE-------------------------------------------------------
+# filter(players, G > mean(G))
+# filter(players, G < median(G))
-## ---- results = "hide"---------------------------------------------------
-filter(players, ntile(G, 2) == 2)
+## ---- eval = FALSE-------------------------------------------------------
+# filter(players, ntile(G, 2) == 2)
## ------------------------------------------------------------------------
mutate(players, career_year = yearID - min(yearID) + 1)
@@ -92,32 +95,3 @@ mutate(players, career_year = yearID - min(yearID) + 1)
## ------------------------------------------------------------------------
mutate(players, G_z = (G - mean(G)) / sd(G))
-## ---- message = FALSE----------------------------------------------------
-if (has_lahman("postgres")) {
- players_db <- group_by(tbl(lahman_postgres(), "Batting"), playerID)
-
- print(translate_sql(mean(G), tbl = players_db, window = TRUE))
- print(translate_sql(cummean(G), tbl = players_db, window = TRUE))
- print(translate_sql(rank(G), tbl = players_db, window = TRUE))
- print(translate_sql(ntile(G, 2), tbl = players_db, window = TRUE))
- print(translate_sql(lag(G), tbl = players_db, window = TRUE))
-}
-
-## ---- message = FALSE----------------------------------------------------
-if (has_lahman("postgres")) {
- players_by_year <- arrange(players_db, yearID)
- print(translate_sql(cummean(G), tbl = players_by_year, window = TRUE))
- print(translate_sql(rank(), tbl = players_by_year, window = TRUE))
- print(translate_sql(lag(G), tbl = players_by_year, window = TRUE))
-}
-
-## ---- eval = FALSE-------------------------------------------------------
-# mutate(players,
-# min_rank(yearID),
-# order_by(yearID, cumsum(G)),
-# lead(order_by = yearID, G)
-# )
-
-## ---- eval = FALSE-------------------------------------------------------
-# filter(players, rank(G) == 1)
-
diff --git a/inst/doc/window-functions.Rmd b/inst/doc/window-functions.Rmd
index 88dcc27..b89b9a0 100644
--- a/inst/doc/window-functions.Rmd
+++ b/inst/doc/window-functions.Rmd
@@ -1,29 +1,38 @@
---
-title: "Window functions and grouped mutate/filter"
-date: "`r Sys.Date()`"
+title: "Window functions"
output: rmarkdown::html_vignette
vignette: >
- %\VignetteIndexEntry{Window functions and grouped mutate/filter}
+ %\VignetteIndexEntry{Window functions}
%\VignetteEngine{knitr::rmarkdown}
%\usepackage[utf8]{inputenc}
---
-```{r, echo = FALSE, message = FALSE}
+```{r, include = FALSE}
knitr::opts_chunk$set(collapse = T, comment = "#>")
options(tibble.print_min = 4L, tibble.print_max = 4L)
library(dplyr)
+set.seed(1014)
```
A __window function__ is a variation on an aggregation function. Where an aggregation function, like `sum()` and `mean()`, takes n inputs and return a single value, a window function returns n values. The output of a window function depends on all its input values, so window functions don't include functions that work element-wise, like `+` or `round()`. Window functions include variations on aggregate functions, like `cumsum()` and `cummean()`, functions for ranking and ordering, like ` [...]
-Window functions are used in conjunction with `mutate` and `filter` to solve a wide range of problems, some of which are shown below:
+In this vignette, we'll use a small sample of the Lahman batting dataset, including the players that have won an award.
-```{r, results = "hide"}
+```{r}
library(Lahman)
-batting <- select(tbl_df(Batting), playerID, yearID, teamID, G, AB:H)
-batting <- arrange(batting, playerID, yearID, teamID)
-players <- group_by(batting, playerID)
+batting <- Lahman::Batting %>%
+ as_tibble() %>%
+ select(playerID, yearID, teamID, G, AB:H) %>%
+ arrange(playerID, yearID, teamID) %>%
+ semi_join(Lahman::AwardsPlayers, by = "playerID")
+
+players <- batting %>% group_by(playerID)
+```
+
+Window functions are used in conjunction with `mutate()` and `filter()` to solve a wide range of problems. Here's a selection:
+
+```{r, eval = FALSE}
# For each player, find the two years with most hits
filter(players, min_rank(desc(H)) <= 2 & H > 0)
# Within each player, rank each year by the number of games played
@@ -40,15 +49,13 @@ filter(players, G > mean(G))
mutate(players, G_z = (G - mean(G)) / sd(G))
```
-This vignette is broken down into two sections. First you'll learn about the five families of window functions in R, and what you can use them for. If you're only working with local data sources, you can stop there. Otherwise, continue on to learn about window functions in SQL. They are relatively new, but are supported by Postgres, Amazon's Redshift and Google's bigquery. The window functions themselves are basically the same (modulo a few name conflicts), but their specification is a l [...]
-
-Before reading this vignette, you should be familiar with `mutate()` and `filter()`. If you want to use window functions with SQL databases, you should also be familiar with the basics of dplyr's SQL translation.
+Before reading this vignette, you should be familiar with `mutate()` and `filter()`.
## Types of window functions
There are five main families of window functions. Two families are unrelated to aggregation functions:
-* Ranking and ordering functions: `row_number()`, `min_rank` (`RANK` in SQL),
+* Ranking and ordering functions: `row_number()`, `min_rank()`,
`dense_rank()`, `cume_dist()`, `percent_rank()`, and `ntile()`. These
functions all take a vector to order by, and return various types of ranks.
@@ -63,7 +70,7 @@ The other three families are variations on familiar aggregate functions:
* Rolling aggregates operate in a fixed width window. You won't find them in
base R or in dplyr, but there are many implementations in
other packages, such as
- [RcppRoll](http://cran.r-project.org/package=RcppRoll).
+ [RcppRoll](https://cran.r-project.org/package=RcppRoll).
* Recycled aggregates, where an aggregate is repeated to match the length
of the input. These are not needed in R because vector recycling
@@ -73,7 +80,7 @@ The other three families are variations on familiar aggregate functions:
Each family is described in more detail below, focussing on the general goals and how to use them with dplyr. For more details, refer to the individual function documentation.
-### Ranking functions
+## Ranking functions
The ranking functions are variations on a theme, differing in how they handle ties:
@@ -96,11 +103,7 @@ percent_rank(x)
These are useful if you want to select (for example) the top 10% of records within each group. For example:
-```{r, results = 'hide'}
-# Selects best two years
-filter(players, min_rank(desc(G)) < 2)
-
-# Selects best 10% of years
+```{r}
filter(players, cume_dist(desc(G)) < 0.1)
```
@@ -115,7 +118,7 @@ summarise(by_team_quartile, mean(G))
All ranking functions rank from lowest to highest so that small input values get small ranks. Use `desc()` to rank from highest to lowest.
-### Lead and lag
+## Lead and lag
`lead()` and `lag()` produce offset versions of a input vector that is either ahead of or behind the original vector.
@@ -159,13 +162,13 @@ right <- mutate(scrambled, running = order_by(year, cumsum(value)))
arrange(right, year)
```
-### Cumulative aggregates
+## Cumulative aggregates
Base R provides cumulative sum (`cumsum()`), cumulative min (`cummin()`) and cumulative max (`cummax()`). (It also provides `cumprod()` but that is rarely useful). Other common accumulating functions are `cumany()` and `cumall()`, cumulative versions of `||` and `&&`, and `cummean()`, a cumulative mean. These are not included in base R, but efficient versions are provided by `dplyr`.
`cumany()` and `cumall()` are useful for selecting all rows up to, or all rows after, a condition is true for the first (or last) time. For example, we can use `cumany()` to find all records for a player after they played a year with 150 games:
-```{r, results = "hide"}
+```{r, eval = FALSE}
filter(players, cumany(G > 150))
```
@@ -179,18 +182,18 @@ order_by(y, cumsum(x))
This function uses a bit of non-standard evaluation, so I wouldn't recommend using it inside another function; use the simpler but less concise `with_order()` instead.
-### Recycled aggregates
+## Recycled aggregates
R's vector recycling make it easy to select values that are higher or lower than a summary. I call this a recycled aggregate because the value of the aggregate is recycled to be the same length as the original vector. Recycled aggregates are useful if you want to find all records greater than the mean or less than the median:
-```{r, results = "hide"}
+```{r, eval = FALSE}
filter(players, G > mean(G))
filter(players, G < median(G))
```
While most SQL databases don't have an equivalent of `median()` or `quantile()`, when filtering you can achieve the same effect with `ntile()`. For example, `x > median(x)` is equivalent to `ntile(x, 2) == 2`; `x > quantile(x, 75)` is equivalent to `ntile(x, 100) > 75` or `ntile(x, 4) > 3`.
-```{r, results = "hide"}
+```{r, eval = FALSE}
filter(players, ntile(G, 2) == 2)
```
@@ -207,166 +210,3 @@ Or, as in the introductory example, we could compute a z-score:
```{r}
mutate(players, G_z = (G - mean(G)) / sd(G))
```
-
-## Window functions in SQL
-
-Window functions have a slightly different flavour in SQL. The syntax is a little different, and the cumulative, rolling and recycled aggregate functions are all based on the simple aggregate function. The goal in this section is not to tell you everything you need to know about window functions in SQL, but to remind you of the basics and show you how dplyr translates your R expressions in to SQL.
-
-### Structure of a window function in SQL
-
-In SQL, window functions have the form `[expression] OVER ([partition clause] [order clause] [frame_clause])`:
-
-* The __expression__ is a combination of variable names and window functions.
- Support for window functions varies from database to database, but most
- support the ranking functions, `lead`, `lag`, `nth`, `first`,
- `last`, `count`, `min`, `max`, `sum`, `avg` and `stddev`. dplyr
- generates this from the R expression in your `mutate` or `filter` call.
-
-* The __partition clause__ specifies how the window function is broken down
- over groups. It plays an analogous role to `GROUP BY` for aggregate functions,
- and `group_by()` in dplyr. It is possible for different window functions to
- be partitioned into different groups, but not all databases support it, and
- neither does dplyr.
-
-* The __order clause__ controls the ordering (when it makes a difference).
- This is important for the ranking functions since it specifies which
- variables to rank by, but it's also needed for cumulative functions and lead.
- Whenever you're thinking about before and after in SQL, you must always tell
- it which variable defines the order. In dplyr you do this with `arrange()`.
- If the order clause is missing when needed, some databases fail with an
- error message while others return non-deterministic results.
-
-* The __frame clause__ defines which rows, or __frame__, that are passed
- to the window function, describing which rows (relative to the current row)
- should be included. The frame clause provides two offsets which determine
- the start and end of frame. There are three special values: -Inf means
- to include all preceeding rows (in SQL, "unbounded preceding"), 0 means the
- current row ("current row"), and Inf means all following rows ("unbounded
- following)". The complete set of options is comprehensive, but fairly
- confusing, and is summarised visually below.
-
- ![A visual summary of frame options](windows.png)
-
- Of the many possible specifications, there are only three that commonly
- used. They select between aggregation variants:
-
- * Recycled: `BETWEEN UNBOUND PRECEEDING AND UNBOUND FOLLOWING`
-
- * Cumulative: `BETWEEN UNBOUND PRECEEDING AND CURRENT ROW`
-
- * Rolling: `BETWEEN 2 PRECEEDING AND 2 FOLLOWING`
-
- dplyr generates the frame clause based on whether your using a recycled
- aggregate or a cumulative aggregate.
-
-It's easiest to understand these specifications by looking at a few examples. Simple examples just need the partition and order clauses:
-
-* Rank each year within a player by number of home runs:
- `RANK() OVER (PARTITION BY playerID ORDER BY desc(H))`
-
-* Compute change in number of games from one year to the next:
- `G - LAG(G) OVER (PARTITION G playerID ORDER BY yearID)`
-
-Aggregate variants are more verbose because we also need to supply the frame clause:
-
-* Running sum of G for each player: `SUM(G) OVER (PARTITION BY playerID ORDER BY yearID BETWEEN UNBOUND PRECEEDING AND CURRENT ROW)`
-
-* Compute the career year: `YearID - min(YearID) OVER (PARTITION BY playerID BETWEEN UNBOUND PRECEEDING AND UNBOUND FOLLOWING) + 1`
-
-* Compute a rolling average of games player: `MEAN(G) OVER (PARTITION BY playerID ORDER BY yearID BETWEEN 2 PRECEEDING AND 2 FOLLOWING)`
-
-You'll notice that window functions in SQL are more verbose than in R. This is because different window functions can have different partitions, and the frame specification is more general than the two aggregate variants (recycled and cumulative) provided by dplyr. dplyr makes a tradeoff: you can't access rarely used window function capabilities (unless you write raw SQL), but in return, common operations are much more succinct.
-
-### Translating dplyr to SQL
-
-To see how individual window functions are translated to SQL, we can use `translate_sql()` with the argument `window = TRUE`.
-
-```{r, message = FALSE}
-if (has_lahman("postgres")) {
- players_db <- group_by(tbl(lahman_postgres(), "Batting"), playerID)
-
- print(translate_sql(mean(G), tbl = players_db, window = TRUE))
- print(translate_sql(cummean(G), tbl = players_db, window = TRUE))
- print(translate_sql(rank(G), tbl = players_db, window = TRUE))
- print(translate_sql(ntile(G, 2), tbl = players_db, window = TRUE))
- print(translate_sql(lag(G), tbl = players_db, window = TRUE))
-}
-```
-
-If the tbl has been arranged previously, then that ordering will be used for the order clause:
-
-```{r, message = FALSE}
-if (has_lahman("postgres")) {
- players_by_year <- arrange(players_db, yearID)
- print(translate_sql(cummean(G), tbl = players_by_year, window = TRUE))
- print(translate_sql(rank(), tbl = players_by_year, window = TRUE))
- print(translate_sql(lag(G), tbl = players_by_year, window = TRUE))
-}
-```
-
-There are some challenges when translating window functions between R and SQL, because dplyr tries to keep the window functions as similar as possible to both the existing R analogues and to the SQL functions. This means that there are three ways to control the order clause depending on which window function you're using:
-
-* For ranking functions, the ordering variable is the first argument: `rank(x)`,
- `ntile(y, 2)`. If omitted or `NULL`, will use the default ordering associated
- with the tbl (as set by `arrange()`).
-
-* Accumulating aggegates only take a single argument (the vector to aggregate).
- To control ordering, use `order_by()`.
-
-* Aggregates implemented in dplyr (`lead`, `lag`, `nth_value`, `first_value`,
- `last_value`) have an `order_by` argument. Supply it to override the
- default ordering.
-
-The three options are illustrated in the snippet below:
-
-```{r, eval = FALSE}
-mutate(players,
- min_rank(yearID),
- order_by(yearID, cumsum(G)),
- lead(order_by = yearID, G)
-)
-```
-
-Currently there is no way to order by multiple variables, except by setting the default ordering with `arrange()`. This will be added in a future release.
-
-### Translating filters based on window functions
-
-There are some restrictions on window functions in SQL that make their use with `WHERE` somewhat challenging. Take this simple example, where we want to find the year each player played the most games:
-
-```{r, eval = FALSE}
-filter(players, rank(G) == 1)
-```
-
-The following straightforward translation does not work because window functions are only allowed in `SELECT` and `ORDER_BY`.
-
-```
-SELECT *
-FROM Batting
-WHERE rank() OVER (PARTITION BY "playerID" ORDER BY "G") = 1;
-```
-
-Computing the window function in `SELECT` and referring to it in `WHERE` or `HAVING` doesn't work either, because `WHERE` and `HAVING` are computed before windowing functions.
-
-```
-SELECT *, rank() OVER (PARTITION BY "playerID" ORDER BY "G") as rank
-FROM Batting
-WHERE rank = 1;
-
-SELECT *, rank() OVER (PARTITION BY "playerID" ORDER BY "G") as rank
-FROM Batting
-HAVING rank = 1;
-```
-
-Instead, we must use a subquery:
-
-```
-SELECT *
-FROM (
- SELECT *, rank() OVER (PARTITION BY "playerID" ORDER BY "G") as rank
- FROM Batting
-) tmp
-WHERE rank = 1;
-```
-
-And even that query is a slightly simplification because it will also add a rank column to the original columns. dplyr takes care of generating the full, verbose, query, so you can focus on your data analysis challenges.
-
diff --git a/inst/doc/window-functions.html b/inst/doc/window-functions.html
index 593a193..94e3fe4 100644
--- a/inst/doc/window-functions.html
+++ b/inst/doc/window-functions.html
@@ -4,16 +4,15 @@
<head>
-<meta charset="utf-8">
+<meta charset="utf-8" />
<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
<meta name="generator" content="pandoc" />
<meta name="viewport" content="width=device-width, initial-scale=1">
-<meta name="date" content="2016-06-23" />
-<title>Window functions and grouped mutate/filter</title>
+<title>Window functions</title>
@@ -67,50 +66,54 @@ code > span.in { color: #60a0b0; font-weight: bold; font-style: italic; } /* Inf
-<h1 class="title toc-ignore">Window functions and grouped mutate/filter</h1>
-<h4 class="date"><em>2016-06-23</em></h4>
+<h1 class="title toc-ignore">Window functions</h1>
<p>A <strong>window function</strong> is a variation on an aggregation function. Where an aggregation function, like <code>sum()</code> and <code>mean()</code>, takes n inputs and return a single value, a window function returns n values. The output of a window function depends on all its input values, so window functions don’t include functions that work element-wise, like <code>+</code> or <code>round()</code>. Window functions include variations on aggregate functions, like <code>cums [...]
-<p>Window functions are used in conjunction with <code>mutate</code> and <code>filter</code> to solve a wide range of problems, some of which are shown below:</p>
+<p>In this vignette, we’ll use a small sample of the Lahman batting dataset, including the players that have won an award.</p>
<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"><span class="kw">library</span>(Lahman)
-batting <-<span class="st"> </span><span class="kw">select</span>(<span class="kw">tbl_df</span>(Batting), playerID, yearID, teamID, G, AB:H)
-batting <-<span class="st"> </span><span class="kw">arrange</span>(batting, playerID, yearID, teamID)
-players <-<span class="st"> </span><span class="kw">group_by</span>(batting, playerID)
-<span class="co"># For each player, find the two years with most hits</span>
-<span class="kw">filter</span>(players, <span class="kw">min_rank</span>(<span class="kw">desc</span>(H)) <=<span class="st"> </span><span class="dv">2</span> &<span class="st"> </span>H ><span class="st"> </span><span class="dv">0</span>)
+batting <-<span class="st"> </span>Lahman<span class="op">::</span>Batting <span class="op">%>%</span>
+<span class="st"> </span><span class="kw">as_tibble</span>() <span class="op">%>%</span>
+<span class="st"> </span><span class="kw">select</span>(playerID, yearID, teamID, G, AB<span class="op">:</span>H) <span class="op">%>%</span>
+<span class="st"> </span><span class="kw">arrange</span>(playerID, yearID, teamID) <span class="op">%>%</span>
+<span class="st"> </span><span class="kw">semi_join</span>(Lahman<span class="op">::</span>AwardsPlayers, <span class="dt">by =</span> <span class="st">"playerID"</span>)
+
+players <-<span class="st"> </span>batting <span class="op">%>%</span><span class="st"> </span><span class="kw">group_by</span>(playerID)</code></pre></div>
+<p>Window functions are used in conjunction with <code>mutate()</code> and <code>filter()</code> to solve a wide range of problems. Here’s a selection:</p>
+<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"><span class="co"># For each player, find the two years with most hits</span>
+<span class="kw">filter</span>(players, <span class="kw">min_rank</span>(<span class="kw">desc</span>(H)) <span class="op"><=</span><span class="st"> </span><span class="dv">2</span> <span class="op">&</span><span class="st"> </span>H <span class="op">></span><span class="st"> </span><span class="dv">0</span>)
<span class="co"># Within each player, rank each year by the number of games played</span>
<span class="kw">mutate</span>(players, <span class="dt">G_rank =</span> <span class="kw">min_rank</span>(G))
<span class="co"># For each player, find every year that was better than the previous year</span>
-<span class="kw">filter</span>(players, G ><span class="st"> </span><span class="kw">lag</span>(G))
+<span class="kw">filter</span>(players, G <span class="op">></span><span class="st"> </span><span class="kw">lag</span>(G))
<span class="co"># For each player, compute avg change in games played per year</span>
-<span class="kw">mutate</span>(players, <span class="dt">G_change =</span> (G -<span class="st"> </span><span class="kw">lag</span>(G)) /<span class="st"> </span>(yearID -<span class="st"> </span><span class="kw">lag</span>(yearID)))
+<span class="kw">mutate</span>(players, <span class="dt">G_change =</span> (G <span class="op">-</span><span class="st"> </span><span class="kw">lag</span>(G)) <span class="op">/</span><span class="st"> </span>(yearID <span class="op">-</span><span class="st"> </span><span class="kw">lag</span>(yearID)))
<span class="co"># For each player, find all where they played more games than average</span>
-<span class="kw">filter</span>(players, G ><span class="st"> </span><span class="kw">mean</span>(G))
+<span class="kw">filter</span>(players, G <span class="op">></span><span class="st"> </span><span class="kw">mean</span>(G))
<span class="co"># For each, player compute a z score based on number of games played</span>
-<span class="kw">mutate</span>(players, <span class="dt">G_z =</span> (G -<span class="st"> </span><span class="kw">mean</span>(G)) /<span class="st"> </span><span class="kw">sd</span>(G))</code></pre></div>
-<p>This vignette is broken down into two sections. First you’ll learn about the five families of window functions in R, and what you can use them for. If you’re only working with local data sources, you can stop there. Otherwise, continue on to learn about window functions in SQL. They are relatively new, but are supported by Postgres, Amazon’s Redshift and Google’s bigquery. The window functions themselves are basically the same (modulo a few name conflicts), but their specification is [...]
-<p>Before reading this vignette, you should be familiar with <code>mutate()</code> and <code>filter()</code>. If you want to use window functions with SQL databases, you should also be familiar with the basics of dplyr’s SQL translation.</p>
+<span class="kw">mutate</span>(players, <span class="dt">G_z =</span> (G <span class="op">-</span><span class="st"> </span><span class="kw">mean</span>(G)) <span class="op">/</span><span class="st"> </span><span class="kw">sd</span>(G))</code></pre></div>
+<p>Before reading this vignette, you should be familiar with <code>mutate()</code> and <code>filter()</code>.</p>
<div id="types-of-window-functions" class="section level2">
<h2>Types of window functions</h2>
<p>There are five main families of window functions. Two families are unrelated to aggregation functions:</p>
<ul>
-<li><p>Ranking and ordering functions: <code>row_number()</code>, <code>min_rank</code> (<code>RANK</code> in SQL), <code>dense_rank()</code>, <code>cume_dist()</code>, <code>percent_rank()</code>, and <code>ntile()</code>. These functions all take a vector to order by, and return various types of ranks.</p></li>
+<li><p>Ranking and ordering functions: <code>row_number()</code>, <code>min_rank()</code>, <code>dense_rank()</code>, <code>cume_dist()</code>, <code>percent_rank()</code>, and <code>ntile()</code>. These functions all take a vector to order by, and return various types of ranks.</p></li>
<li><p>Offsets <code>lead()</code> and <code>lag()</code> allow you to access the previous and next values in a vector, making it easy to compute differences and trends.</p></li>
</ul>
<p>The other three families are variations on familiar aggregate functions:</p>
<ul>
<li><p>Cumulative aggregates: <code>cumsum()</code>, <code>cummin()</code>, <code>cummax()</code> (from base R), and <code>cumall()</code>, <code>cumany()</code>, and <code>cummean()</code> (from dplyr).</p></li>
-<li><p>Rolling aggregates operate in a fixed width window. You won’t find them in base R or in dplyr, but there are many implementations in other packages, such as <a href="http://cran.r-project.org/package=RcppRoll">RcppRoll</a>.</p></li>
+<li><p>Rolling aggregates operate in a fixed width window. You won’t find them in base R or in dplyr, but there are many implementations in other packages, such as <a href="https://cran.r-project.org/package=RcppRoll">RcppRoll</a>.</p></li>
<li><p>Recycled aggregates, where an aggregate is repeated to match the length of the input. These are not needed in R because vector recycling automatically recycles aggregates where needed. They are important in SQL, because the presence of an aggregation function usually tells the database to return only one row per group.</p></li>
</ul>
<p>Each family is described in more detail below, focussing on the general goals and how to use them with dplyr. For more details, refer to the individual function documentation.</p>
-<div id="ranking-functions" class="section level3">
-<h3>Ranking functions</h3>
+</div>
+<div id="ranking-functions" class="section level2">
+<h2>Ranking functions</h2>
<p>The ranking functions are variations on a theme, differing in how they handle ties:</p>
<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r">x <-<span class="st"> </span><span class="kw">c</span>(<span class="dv">1</span>, <span class="dv">1</span>, <span class="dv">2</span>, <span class="dv">2</span>, <span class="dv">2</span>)
@@ -127,29 +130,34 @@ players <-<span class="st"> </span><span class="kw">group_by</span>(batting,
<span class="kw">percent_rank</span>(x)
<span class="co">#> [1] 0.0 0.0 0.5 0.5 0.5</span></code></pre></div>
<p>These are useful if you want to select (for example) the top 10% of records within each group. For example:</p>
-<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"><span class="co"># Selects best two years</span>
-<span class="kw">filter</span>(players, <span class="kw">min_rank</span>(<span class="kw">desc</span>(G)) <<span class="st"> </span><span class="dv">2</span>)
-
-<span class="co"># Selects best 10% of years</span>
-<span class="kw">filter</span>(players, <span class="kw">cume_dist</span>(<span class="kw">desc</span>(G)) <<span class="st"> </span><span class="fl">0.1</span>)</code></pre></div>
+<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"><span class="kw">filter</span>(players, <span class="kw">cume_dist</span>(<span class="kw">desc</span>(G)) <span class="op"><</span><span class="st"> </span><span class="fl">0.1</span>)
+<span class="co">#> # A tibble: 995 x 7</span>
+<span class="co">#> # Groups: playerID [906]</span>
+<span class="co">#> playerID yearID teamID G AB R H</span>
+<span class="co">#> <chr> <int> <fctr> <int> <int> <int> <int></span>
+<span class="co">#> 1 aaronha01 1963 ML1 161 631 121 201</span>
+<span class="co">#> 2 aaronha01 1968 ATL 160 606 84 174</span>
+<span class="co">#> 3 abbotji01 1991 CAL 34 NA NA NA</span>
+<span class="co">#> 4 abernte02 1965 CHN 84 18 1 3</span>
+<span class="co">#> # ... with 991 more rows</span></code></pre></div>
<p>Finally, <code>ntile()</code> divides the data up into <code>n</code> evenly sized buckets. It’s a coarse ranking, and it can be used in with <code>mutate()</code> to divide the data into buckets for further summary. For example, we could use <code>ntile()</code> to divide the players within a team into four ranked groups, and calculate the average number of games within each group.</p>
<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r">by_team_player <-<span class="st"> </span><span class="kw">group_by</span>(batting, teamID, playerID)
by_team <-<span class="st"> </span><span class="kw">summarise</span>(by_team_player, <span class="dt">G =</span> <span class="kw">sum</span>(G))
by_team_quartile <-<span class="st"> </span><span class="kw">group_by</span>(by_team, <span class="dt">quartile =</span> <span class="kw">ntile</span>(G, <span class="dv">4</span>))
<span class="kw">summarise</span>(by_team_quartile, <span class="kw">mean</span>(G))
<span class="co">#> # A tibble: 4 x 2</span>
-<span class="co">#> quartile mean(G)</span>
-<span class="co">#> <int> <dbl></span>
-<span class="co">#> 1 1 5.326246</span>
-<span class="co">#> 2 2 24.698241</span>
-<span class="co">#> 3 3 76.878375</span>
-<span class="co">#> 4 4 372.300738</span></code></pre></div>
+<span class="co">#> quartile `mean(G)`</span>
+<span class="co">#> <int> <dbl></span>
+<span class="co">#> 1 1 27.2</span>
+<span class="co">#> 2 2 97.6</span>
+<span class="co">#> 3 3 272 </span>
+<span class="co">#> 4 4 976</span></code></pre></div>
<p>All ranking functions rank from lowest to highest so that small input values get small ranks. Use <code>desc()</code> to rank from highest to lowest.</p>
</div>
-<div id="lead-and-lag" class="section level3">
-<h3>Lead and lag</h3>
+<div id="lead-and-lag" class="section level2">
+<h2>Lead and lag</h2>
<p><code>lead()</code> and <code>lag()</code> produce offset versions of a input vector that is either ahead of or behind the original vector.</p>
-<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r">x <-<span class="st"> </span><span class="dv">1</span>:<span class="dv">5</span>
+<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r">x <-<span class="st"> </span><span class="dv">1</span><span class="op">:</span><span class="dv">5</span>
<span class="kw">lead</span>(x)
<span class="co">#> [1] 2 3 4 5 NA</span>
<span class="kw">lag</span>(x)
@@ -158,15 +166,15 @@ by_team_quartile <-<span class="st"> </span><span class="kw">group_by</span>(
<ul>
<li><p>Compute differences or percent changes.</p>
<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"><span class="co"># Compute the relative change in games played</span>
-<span class="kw">mutate</span>(players, <span class="dt">G_delta =</span> G -<span class="st"> </span><span class="kw">lag</span>(G))</code></pre></div>
+<span class="kw">mutate</span>(players, <span class="dt">G_delta =</span> G <span class="op">-</span><span class="st"> </span><span class="kw">lag</span>(G))</code></pre></div>
<p>Using <code>lag()</code> is more convenient than <code>diff()</code> because for <code>n</code> inputs <code>diff()</code> returns <code>n - 1</code> outputs.</p></li>
<li><p>Find out when a value changes.</p>
<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"><span class="co"># Find when a player changed teams</span>
-<span class="kw">filter</span>(players, teamID !=<span class="st"> </span><span class="kw">lag</span>(teamID))</code></pre></div></li>
+<span class="kw">filter</span>(players, teamID <span class="op">!=</span><span class="st"> </span><span class="kw">lag</span>(teamID))</code></pre></div></li>
</ul>
<p><code>lead()</code> and <code>lag()</code> have an optional argument <code>order_by</code>. If set, instead of using the row order to determine which value comes before another, they will use another variable. This important if you have not already sorted the data, or you want to sort one way and lag another.</p>
<p>Here’s a simple example of what happens if you don’t specify <code>order_by</code> when you need it:</p>
-<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r">df <-<span class="st"> </span><span class="kw">data.frame</span>(<span class="dt">year =</span> <span class="dv">2000</span>:<span class="dv">2005</span>, <span class="dt">value =</span> (<span class="dv">0</span>:<span class="dv">5</span>) ^<span class="st"> </span><span class="dv">2</span>)
+<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r">df <-<span class="st"> </span><span class="kw">data.frame</span>(<span class="dt">year =</span> <span class="dv">2000</span><span class="op">:</span><span class="dv">2005</span>, <span class="dt">value =</span> (<span class="dv">0</span><span class="op">:</span><span class="dv">5</span>) <span class="op">^</span><span class="st"> </span><span class="dv">2</span>)
scrambled <-<span class="st"> </span>df[<span class="kw">sample</span>(<span class="kw">nrow</span>(df)), ]
wrong <-<span class="st"> </span><span class="kw">mutate</span>(scrambled, <span class="dt">running =</span> <span class="kw">cumsum</span>(value))
@@ -174,10 +182,10 @@ wrong <-<span class="st"> </span><span class="kw">mutate</span>(scrambled, <s
<span class="co">#> year value running</span>
<span class="co">#> 1 2000 0 0</span>
<span class="co">#> 2 2001 1 55</span>
-<span class="co">#> 3 2002 4 29</span>
+<span class="co">#> 3 2002 4 20</span>
<span class="co">#> 4 2003 9 54</span>
-<span class="co">#> 5 2004 16 45</span>
-<span class="co">#> 6 2005 25 25</span>
+<span class="co">#> 5 2004 16 16</span>
+<span class="co">#> 6 2005 25 45</span>
right <-<span class="st"> </span><span class="kw">mutate</span>(scrambled, <span class="dt">running =</span> <span class="kw">order_by</span>(year, <span class="kw">cumsum</span>(value)))
<span class="kw">arrange</span>(right, year)
@@ -189,142 +197,48 @@ right <-<span class="st"> </span><span class="kw">mutate</span>(scrambled, <s
<span class="co">#> 5 2004 16 30</span>
<span class="co">#> 6 2005 25 55</span></code></pre></div>
</div>
-<div id="cumulative-aggregates" class="section level3">
-<h3>Cumulative aggregates</h3>
+<div id="cumulative-aggregates" class="section level2">
+<h2>Cumulative aggregates</h2>
<p>Base R provides cumulative sum (<code>cumsum()</code>), cumulative min (<code>cummin()</code>) and cumulative max (<code>cummax()</code>). (It also provides <code>cumprod()</code> but that is rarely useful). Other common accumulating functions are <code>cumany()</code> and <code>cumall()</code>, cumulative versions of <code>||</code> and <code>&&</code>, and <code>cummean()</code>, a cumulative mean. These are not included in base R, but efficient versions are provided by <cod [...]
<p><code>cumany()</code> and <code>cumall()</code> are useful for selecting all rows up to, or all rows after, a condition is true for the first (or last) time. For example, we can use <code>cumany()</code> to find all records for a player after they played a year with 150 games:</p>
-<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"><span class="kw">filter</span>(players, <span class="kw">cumany</span>(G ><span class="st"> </span><span class="dv">150</span>))</code></pre></div>
+<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"><span class="kw">filter</span>(players, <span class="kw">cumany</span>(G <span class="op">></span><span class="st"> </span><span class="dv">150</span>))</code></pre></div>
<p>Like lead and lag, you may want to control the order in which the accumulation occurs. None of the built in functions have an <code>order_by</code> argument so <code>dplyr</code> provides a helper: <code>order_by()</code>. You give it the variable you want to order by, and then the call to the window function:</p>
-<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r">x <-<span class="st"> </span><span class="dv">1</span>:<span class="dv">10</span>
-y <-<span class="st"> </span><span class="dv">10</span>:<span class="dv">1</span>
+<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r">x <-<span class="st"> </span><span class="dv">1</span><span class="op">:</span><span class="dv">10</span>
+y <-<span class="st"> </span><span class="dv">10</span><span class="op">:</span><span class="dv">1</span>
<span class="kw">order_by</span>(y, <span class="kw">cumsum</span>(x))
<span class="co">#> [1] 55 54 52 49 45 40 34 27 19 10</span></code></pre></div>
<p>This function uses a bit of non-standard evaluation, so I wouldn’t recommend using it inside another function; use the simpler but less concise <code>with_order()</code> instead.</p>
</div>
-<div id="recycled-aggregates" class="section level3">
-<h3>Recycled aggregates</h3>
+<div id="recycled-aggregates" class="section level2">
+<h2>Recycled aggregates</h2>
<p>R’s vector recycling make it easy to select values that are higher or lower than a summary. I call this a recycled aggregate because the value of the aggregate is recycled to be the same length as the original vector. Recycled aggregates are useful if you want to find all records greater than the mean or less than the median:</p>
-<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"><span class="kw">filter</span>(players, G ><span class="st"> </span><span class="kw">mean</span>(G))
-<span class="kw">filter</span>(players, G <<span class="st"> </span><span class="kw">median</span>(G))</code></pre></div>
+<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"><span class="kw">filter</span>(players, G <span class="op">></span><span class="st"> </span><span class="kw">mean</span>(G))
+<span class="kw">filter</span>(players, G <span class="op"><</span><span class="st"> </span><span class="kw">median</span>(G))</code></pre></div>
<p>While most SQL databases don’t have an equivalent of <code>median()</code> or <code>quantile()</code>, when filtering you can achieve the same effect with <code>ntile()</code>. For example, <code>x > median(x)</code> is equivalent to <code>ntile(x, 2) == 2</code>; <code>x > quantile(x, 75)</code> is equivalent to <code>ntile(x, 100) > 75</code> or <code>ntile(x, 4) > 3</code>.</p>
-<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"><span class="kw">filter</span>(players, <span class="kw">ntile</span>(G, <span class="dv">2</span>) ==<span class="st"> </span><span class="dv">2</span>)</code></pre></div>
+<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"><span class="kw">filter</span>(players, <span class="kw">ntile</span>(G, <span class="dv">2</span>) <span class="op">==</span><span class="st"> </span><span class="dv">2</span>)</code></pre></div>
<p>You can also use this idea to select the records with the highest (<code>x == max(x)</code>) or lowest value (<code>x == min(x)</code>) for a field, but the ranking functions give you more control over ties, and allow you to select any number of records.</p>
<p>Recycled aggregates are also useful in conjunction with <code>mutate()</code>. For example, with the batting data, we could compute the “career year”, the number of years a player has played since they entered the league:</p>
-<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"><span class="kw">mutate</span>(players, <span class="dt">career_year =</span> yearID -<span class="st"> </span><span class="kw">min</span>(yearID) +<span class="st"> </span><span class="dv">1</span>)
-<span class="co">#> # A tibble: 99,846 x 8</span>
-<span class="co">#> playerID yearID teamID G AB R H career_year</span>
-<span class="co">#> <chr> <int> <fctr> <int> <int> <int> <int> <dbl></span>
-<span class="co">#> 1 aardsda01 2004 SFN 11 0 0 0 1</span>
-<span class="co">#> 2 aardsda01 2006 CHN 45 2 0 0 3</span>
-<span class="co">#> 3 aardsda01 2007 CHA 25 0 0 0 4</span>
-<span class="co">#> 4 aardsda01 2008 BOS 47 1 0 0 5</span>
-<span class="co">#> ... with 99,842 more rows</span></code></pre></div>
+<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"><span class="kw">mutate</span>(players, <span class="dt">career_year =</span> yearID <span class="op">-</span><span class="st"> </span><span class="kw">min</span>(yearID) <span class="op">+</span><span class="st"> </span><span class="dv">1</span>)
+<span class="co">#> # A tibble: 19,113 x 8</span>
+<span class="co">#> # Groups: playerID [1,322]</span>
+<span class="co">#> playerID yearID teamID G AB R H career_year</span>
+<span class="co">#> <chr> <int> <fctr> <int> <int> <int> <int> <dbl></span>
+<span class="co">#> 1 aaronha01 1954 ML1 122 468 58 131 1.00</span>
+<span class="co">#> 2 aaronha01 1955 ML1 153 602 105 189 2.00</span>
+<span class="co">#> 3 aaronha01 1956 ML1 153 609 106 200 3.00</span>
+<span class="co">#> 4 aaronha01 1957 ML1 151 615 118 198 4.00</span>
+<span class="co">#> # ... with 19,109 more rows</span></code></pre></div>
<p>Or, as in the introductory example, we could compute a z-score:</p>
-<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"><span class="kw">mutate</span>(players, <span class="dt">G_z =</span> (G -<span class="st"> </span><span class="kw">mean</span>(G)) /<span class="st"> </span><span class="kw">sd</span>(G))
-<span class="co">#> # A tibble: 99,846 x 8</span>
-<span class="co">#> playerID yearID teamID G AB R H G_z</span>
-<span class="co">#> <chr> <int> <fctr> <int> <int> <int> <int> <dbl></span>
-<span class="co">#> 1 aardsda01 2004 SFN 11 0 0 0 -1.1167685</span>
-<span class="co">#> 2 aardsda01 2006 CHN 45 2 0 0 0.3297126</span>
-<span class="co">#> 3 aardsda01 2007 CHA 25 0 0 0 -0.5211586</span>
-<span class="co">#> 4 aardsda01 2008 BOS 47 1 0 0 0.4147997</span>
-<span class="co">#> ... with 99,842 more rows</span></code></pre></div>
-</div>
-</div>
-<div id="window-functions-in-sql" class="section level2">
-<h2>Window functions in SQL</h2>
-<p>Window functions have a slightly different flavour in SQL. The syntax is a little different, and the cumulative, rolling and recycled aggregate functions are all based on the simple aggregate function. The goal in this section is not to tell you everything you need to know about window functions in SQL, but to remind you of the basics and show you how dplyr translates your R expressions in to SQL.</p>
-<div id="structure-of-a-window-function-in-sql" class="section level3">
-<h3>Structure of a window function in SQL</h3>
-<p>In SQL, window functions have the form <code>[expression] OVER ([partition clause] [order clause] [frame_clause])</code>:</p>
-<ul>
-<li><p>The <strong>expression</strong> is a combination of variable names and window functions. Support for window functions varies from database to database, but most support the ranking functions, <code>lead</code>, <code>lag</code>, <code>nth</code>, <code>first</code>, <code>last</code>, <code>count</code>, <code>min</code>, <code>max</code>, <code>sum</code>, <code>avg</code> and <code>stddev</code>. dplyr generates this from the R expression in your <code>mutate</code> or <code>fil [...]
-<li><p>The <strong>partition clause</strong> specifies how the window function is broken down over groups. It plays an analogous role to <code>GROUP BY</code> for aggregate functions, and <code>group_by()</code> in dplyr. It is possible for different window functions to be partitioned into different groups, but not all databases support it, and neither does dplyr.</p></li>
-<li><p>The <strong>order clause</strong> controls the ordering (when it makes a difference). This is important for the ranking functions since it specifies which variables to rank by, but it’s also needed for cumulative functions and lead. Whenever you’re thinking about before and after in SQL, you must always tell it which variable defines the order. In dplyr you do this with <code>arrange()</code>. If the order clause is missing when needed, some databases fail with an error message wh [...]
-<li><p>The <strong>frame clause</strong> defines which rows, or <strong>frame</strong>, that are passed to the window function, describing which rows (relative to the current row) should be included. The frame clause provides two offsets which determine the start and end of frame. There are three special values: -Inf means to include all preceeding rows (in SQL, “unbounded preceding”), 0 means the current row (“current row”), and Inf means all following rows (“unbounded following)”. The [...]
-<div class="figure">
-<img src="data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAlgAAADoCAYAAAAzM5qAAAAMRmlDQ1BJQ0MgUHJvZmlsZQAASA2tV3dYU1cbf+9IAiEJIyECMsJeouwpewsKMoU6CEkgYcQQCCpua2kF60DFgaOiVRGrVisgdSDitihu6/hQi4JSiwMXKt+5YdinT7//vvs859zf/b3vee/vfe+55zkHQNtOqFDk4ToA+fIiZUJkiGBSWrqAcQ+0gAU81NyFokJFcHx8LPzP680NwCjjVScq1v90+3eDrlhSKALA4pE5U1woykf4FwCSI1IoiwBoLYi3nFGkoHAnwjwlEojwRwpnqzEdqQde5gC2UvskJYQC0L0BNFhCoTIbgBOGeEGxKBvF4YgRdpaLZXKEVyMcIJIKEce5hvCo/PzpCGsjCHaZf4uT/TcsFGYOxxQKs4fxQC7UUNAIkxUq8oSz1A//zy4/T4Xq [...]
-<p class="caption">A visual summary of frame options</p>
-</div>
-<p>Of the many possible specifications, there are only three that commonly used. They select between aggregation variants:</p>
-<ul>
-<li><p>Recycled: <code>BETWEEN UNBOUND PRECEEDING AND UNBOUND FOLLOWING</code></p></li>
-<li><p>Cumulative: <code>BETWEEN UNBOUND PRECEEDING AND CURRENT ROW</code></p></li>
-<li><p>Rolling: <code>BETWEEN 2 PRECEEDING AND 2 FOLLOWING</code></p></li>
-</ul>
-<p>dplyr generates the frame clause based on whether your using a recycled aggregate or a cumulative aggregate.</p></li>
-</ul>
-<p>It’s easiest to understand these specifications by looking at a few examples. Simple examples just need the partition and order clauses:</p>
-<ul>
-<li><p>Rank each year within a player by number of home runs: <code>RANK() OVER (PARTITION BY playerID ORDER BY desc(H))</code></p></li>
-<li><p>Compute change in number of games from one year to the next: <code>G - LAG(G) OVER (PARTITION G playerID ORDER BY yearID)</code></p></li>
-</ul>
-<p>Aggregate variants are more verbose because we also need to supply the frame clause:</p>
-<ul>
-<li><p>Running sum of G for each player: <code>SUM(G) OVER (PARTITION BY playerID ORDER BY yearID BETWEEN UNBOUND PRECEEDING AND CURRENT ROW)</code></p></li>
-<li><p>Compute the career year: <code>YearID - min(YearID) OVER (PARTITION BY playerID BETWEEN UNBOUND PRECEEDING AND UNBOUND FOLLOWING) + 1</code></p></li>
-<li><p>Compute a rolling average of games player: <code>MEAN(G) OVER (PARTITION BY playerID ORDER BY yearID BETWEEN 2 PRECEEDING AND 2 FOLLOWING)</code></p></li>
-</ul>
-<p>You’ll notice that window functions in SQL are more verbose than in R. This is because different window functions can have different partitions, and the frame specification is more general than the two aggregate variants (recycled and cumulative) provided by dplyr. dplyr makes a tradeoff: you can’t access rarely used window function capabilities (unless you write raw SQL), but in return, common operations are much more succinct.</p>
-</div>
-<div id="translating-dplyr-to-sql" class="section level3">
-<h3>Translating dplyr to SQL</h3>
-<p>To see how individual window functions are translated to SQL, we can use <code>translate_sql()</code> with the argument <code>window = TRUE</code>.</p>
-<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r">if (<span class="kw">has_lahman</span>(<span class="st">"postgres"</span>)) {
- players_db <-<span class="st"> </span><span class="kw">group_by</span>(<span class="kw">tbl</span>(<span class="kw">lahman_postgres</span>(), <span class="st">"Batting"</span>), playerID)
-
- <span class="kw">print</span>(<span class="kw">translate_sql</span>(<span class="kw">mean</span>(G), <span class="dt">tbl =</span> players_db, <span class="dt">window =</span> <span class="ot">TRUE</span>))
- <span class="kw">print</span>(<span class="kw">translate_sql</span>(<span class="kw">cummean</span>(G), <span class="dt">tbl =</span> players_db, <span class="dt">window =</span> <span class="ot">TRUE</span>))
- <span class="kw">print</span>(<span class="kw">translate_sql</span>(<span class="kw">rank</span>(G), <span class="dt">tbl =</span> players_db, <span class="dt">window =</span> <span class="ot">TRUE</span>))
- <span class="kw">print</span>(<span class="kw">translate_sql</span>(<span class="kw">ntile</span>(G, <span class="dv">2</span>), <span class="dt">tbl =</span> players_db, <span class="dt">window =</span> <span class="ot">TRUE</span>))
- <span class="kw">print</span>(<span class="kw">translate_sql</span>(<span class="kw">lag</span>(G), <span class="dt">tbl =</span> players_db, <span class="dt">window =</span> <span class="ot">TRUE</span>))
-}</code></pre></div>
-<p>If the tbl has been arranged previously, then that ordering will be used for the order clause:</p>
-<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r">if (<span class="kw">has_lahman</span>(<span class="st">"postgres"</span>)) {
- players_by_year <-<span class="st"> </span><span class="kw">arrange</span>(players_db, yearID)
- <span class="kw">print</span>(<span class="kw">translate_sql</span>(<span class="kw">cummean</span>(G), <span class="dt">tbl =</span> players_by_year, <span class="dt">window =</span> <span class="ot">TRUE</span>))
- <span class="kw">print</span>(<span class="kw">translate_sql</span>(<span class="kw">rank</span>(), <span class="dt">tbl =</span> players_by_year, <span class="dt">window =</span> <span class="ot">TRUE</span>))
- <span class="kw">print</span>(<span class="kw">translate_sql</span>(<span class="kw">lag</span>(G), <span class="dt">tbl =</span> players_by_year, <span class="dt">window =</span> <span class="ot">TRUE</span>))
-}</code></pre></div>
-<p>There are some challenges when translating window functions between R and SQL, because dplyr tries to keep the window functions as similar as possible to both the existing R analogues and to the SQL functions. This means that there are three ways to control the order clause depending on which window function you’re using:</p>
-<ul>
-<li><p>For ranking functions, the ordering variable is the first argument: <code>rank(x)</code>, <code>ntile(y, 2)</code>. If omitted or <code>NULL</code>, will use the default ordering associated with the tbl (as set by <code>arrange()</code>).</p></li>
-<li><p>Accumulating aggegates only take a single argument (the vector to aggregate). To control ordering, use <code>order_by()</code>.</p></li>
-<li><p>Aggregates implemented in dplyr (<code>lead</code>, <code>lag</code>, <code>nth_value</code>, <code>first_value</code>, <code>last_value</code>) have an <code>order_by</code> argument. Supply it to override the default ordering.</p></li>
-</ul>
-<p>The three options are illustrated in the snippet below:</p>
-<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"><span class="kw">mutate</span>(players,
- <span class="kw">min_rank</span>(yearID),
- <span class="kw">order_by</span>(yearID, <span class="kw">cumsum</span>(G)),
- <span class="kw">lead</span>(<span class="dt">order_by =</span> yearID, G)
-)</code></pre></div>
-<p>Currently there is no way to order by multiple variables, except by setting the default ordering with <code>arrange()</code>. This will be added in a future release.</p>
-</div>
-<div id="translating-filters-based-on-window-functions" class="section level3">
-<h3>Translating filters based on window functions</h3>
-<p>There are some restrictions on window functions in SQL that make their use with <code>WHERE</code> somewhat challenging. Take this simple example, where we want to find the year each player played the most games:</p>
-<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"><span class="kw">filter</span>(players, <span class="kw">rank</span>(G) ==<span class="st"> </span><span class="dv">1</span>)</code></pre></div>
-<p>The following straightforward translation does not work because window functions are only allowed in <code>SELECT</code> and <code>ORDER_BY</code>.</p>
-<pre><code>SELECT *
-FROM Batting
-WHERE rank() OVER (PARTITION BY "playerID" ORDER BY "G") = 1;</code></pre>
-<p>Computing the window function in <code>SELECT</code> and referring to it in <code>WHERE</code> or <code>HAVING</code> doesn’t work either, because <code>WHERE</code> and <code>HAVING</code> are computed before windowing functions.</p>
-<pre><code>SELECT *, rank() OVER (PARTITION BY "playerID" ORDER BY "G") as rank
-FROM Batting
-WHERE rank = 1;
-
-SELECT *, rank() OVER (PARTITION BY "playerID" ORDER BY "G") as rank
-FROM Batting
-HAVING rank = 1;</code></pre>
-<p>Instead, we must use a subquery:</p>
-<pre><code>SELECT *
-FROM (
- SELECT *, rank() OVER (PARTITION BY "playerID" ORDER BY "G") as rank
- FROM Batting
-) tmp
-WHERE rank = 1;</code></pre>
-<p>And even that query is a slightly simplification because it will also add a rank column to the original columns. dplyr takes care of generating the full, verbose, query, so you can focus on your data analysis challenges.</p>
-</div>
+<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"><span class="kw">mutate</span>(players, <span class="dt">G_z =</span> (G <span class="op">-</span><span class="st"> </span><span class="kw">mean</span>(G)) <span class="op">/</span><span class="st"> </span><span class="kw">sd</span>(G))
+<span class="co">#> # A tibble: 19,113 x 8</span>
+<span class="co">#> # Groups: playerID [1,322]</span>
+<span class="co">#> playerID yearID teamID G AB R H G_z</span>
+<span class="co">#> <chr> <int> <fctr> <int> <int> <int> <int> <dbl></span>
+<span class="co">#> 1 aaronha01 1954 ML1 122 468 58 131 -1.16 </span>
+<span class="co">#> 2 aaronha01 1955 ML1 153 602 105 189 0.519</span>
+<span class="co">#> 3 aaronha01 1956 ML1 153 609 106 200 0.519</span>
+<span class="co">#> 4 aaronha01 1957 ML1 151 615 118 198 0.411</span>
+<span class="co">#> # ... with 19,109 more rows</span></code></pre></div>
</div>
@@ -334,7 +248,7 @@ WHERE rank = 1;</code></pre>
(function () {
var script = document.createElement("script");
script.type = "text/javascript";
- script.src = "https://cdn.mathjax.org/mathjax/latest/MathJax.js?config=TeX-AMS-MML_HTMLorMML";
+ script.src = "https://mathjax.rstudio.com/latest/MathJax.js?config=TeX-AMS-MML_HTMLorMML";
document.getElementsByTagName("head")[0].appendChild(script);
})();
</script>
diff --git a/inst/include/dplyr.h b/inst/include/dplyr.h
index 864364d..1c1c6f0 100644
--- a/inst/include/dplyr.h
+++ b/inst/include/dplyr.h
@@ -1,208 +1,10 @@
#ifndef dplyr_dplyr_H
#define dplyr_dplyr_H
-#include <Rcpp.h>
-#include <dplyr/workarounds/static_assert.h>
-#include <solaris/solaris.h>
-#include <dplyr/config.h>
-#include <dplyr/workarounds.h>
-
-using namespace Rcpp ;
-#include <tools/all_na.h>
-// borrowed from Rcpp11
-#ifndef RCPP_DEBUG_OBJECT
- #define RCPP_DEBUG_OBJECT(OBJ) Rf_PrintValue( Rf_eval( Rf_lang2( Rf_install( "str"), OBJ ), R_GlobalEnv ) ) ;
-#endif
-
-#ifndef RCPP_INSPECT_OBJECT
- #define RCPP_INSPECT_OBJECT(OBJ) Rf_PrintValue( Rf_eval( Rf_lang2( Rf_install( ".Internal"), Rf_lang2( Rf_install( "inspect" ), OBJ ) ), R_GlobalEnv ) ) ;
-#endif
-
-#include <boost/scoped_ptr.hpp>
-#include <boost/shared_ptr.hpp>
-#include <boost/functional/hash.hpp>
-
-#ifndef dplyr_hash_map
- #if defined(_WIN32)
- #define dplyr_hash_map RCPP_UNORDERED_MAP
- #else
- #include <boost/unordered_map.hpp>
- #define dplyr_hash_map boost::unordered_map
- #endif
-#endif
-
-#ifndef dplyr_hash_set
- #if defined(_WIN32)
- #define dplyr_hash_set RCPP_UNORDERED_SET
- #else
- #include <boost/unordered_set.hpp>
- #define dplyr_hash_set boost::unordered_set
- #endif
-#endif
+#include <dplyr/main.h>
#include <tools/tools.h>
-namespace dplyr {
- class LazySubsets ;
- Symbol extract_column( SEXP, const Environment& ) ;
- Symbol get_column(SEXP, const Environment&, const LazySubsets& ) ;
- class Result ;
- class ResultSet ;
- class Reducer_Proxy ;
- class DataFrameVisitors ;
- class DataFrameJoinVisitors ;
- std::string get_single_class(SEXP x) ;
-
- void strip_index(DataFrame x) ;
- template <typename Index>
- DataFrame subset( DataFrame df, const Index& indices, CharacterVector classes) ;
- void check_attribute_compatibility( SEXP left, SEXP right) ;
- bool same_levels( SEXP left, SEXP right ) ;
-}
-dplyr::Result* get_handler( SEXP, const dplyr::LazySubsets&, const Environment& ) ;
-dplyr::Result* nth_prototype( SEXP call, const dplyr::LazySubsets& subsets, int nargs) ;
-dplyr::Result* first_prototype( SEXP call, const dplyr::LazySubsets& subsets, int nargs) ;
-dplyr::Result* last_prototype( SEXP call, const dplyr::LazySubsets& subsets, int nargs) ;
-bool argmatch( const std::string& target, const std::string& s) ;
-
-bool can_simplify(SEXP) ;
-
-void assert_all_white_list(const DataFrame&) ;
-inline SEXP shared_SEXP(SEXP x){
- SET_NAMED(x, 2 );
- return x ;
-}
-
-inline SEXP pairlist_shallow_copy(SEXP p){
- Shield<SEXP> attr( Rf_cons(CAR(p), R_NilValue) ) ;
- SEXP q = attr ;
- SET_TAG(q, TAG(p)) ;
- p = CDR(p) ;
- while( !Rf_isNull(p) ){
- Shield<SEXP> s( Rf_cons(CAR(p), R_NilValue) ) ;
- SETCDR(q, s) ;
- q = CDR(q) ;
- SET_TAG(q, TAG(p)) ;
- p = CDR(p) ;
- }
- return attr ;
-}
-
-inline void copy_attributes(SEXP out, SEXP data){
- SEXP att = ATTRIB(data) ;
- if( !Rf_isNull(att) ){
- SET_ATTRIB( out, pairlist_shallow_copy(ATTRIB(data)) ) ;
- }
- SET_OBJECT( out, OBJECT(data) );
- if( IS_S4_OBJECT(data) ) SET_S4_OBJECT(out) ;
-}
-
-// same as copy_attributes but without names
-inline void copy_most_attributes(SEXP out, SEXP data){
- copy_attributes(out,data) ;
- Rf_setAttrib( out, R_NamesSymbol, R_NilValue ) ;
-}
-
-CharacterVector dfloc(List) ;
-SEXP shallow_copy(const List& data) ;
-
-typedef dplyr::Result* (*HybridHandler)(SEXP, const dplyr::LazySubsets&, int) ;
-
-#if defined(COMPILING_DPLYR)
- DataFrame build_index_cpp( DataFrame data ) ;
- void registerHybridHandler( const char* , HybridHandler ) ;
- SEXP get_time_classes() ;
- SEXP get_date_classes() ;
-#endif
-
-#include <dplyr/registration.h>
-
-#include <dplyr/DataFrameAble.h>
-#include <dplyr/CharacterVectorOrderer.h>
-#include <dplyr/white_list.h>
-#include <dplyr/check_supported_type.h>
-#include <dplyr/visitor_set/visitor_set.h>
-#include <dplyr/DataFrameVisitorsIndexSet.h>
-#include <dplyr/DataFrameVisitorsIndexMap.h>
-#include <dplyr/BoolResult.h>
-
-#include <dplyr/EmptySubset.h>
-#include <dplyr/FullDataFrame.h>
-#include <dplyr/GroupedDataFrame.h>
-#include <dplyr/RowwiseDataFrame.h>
-#include <dplyr/tbl_cpp.h>
-#include <dplyr/comparisons.h>
-#include <dplyr/comparisons_different.h>
-#include <dplyr/VectorVisitor.h>
-#include <dplyr/SubsetVectorVisitor.h>
-#include <dplyr/OrderVisitor.h>
-#include <dplyr/VectorVisitorImpl.h>
-#include <dplyr/SubsetVectorVisitorImpl.h>
-#include <dplyr/DataFrameVisitors.h>
-#include <dplyr/MultipleVectorVisitors.h>
-#include <dplyr/DataFrameSubsetVisitors.h>
-#include <dplyr/DataFrameColumnSubsetVisitor.h>
-#include <dplyr/MatrixColumnSubsetVectorVisitor.h>
-#include <dplyr/MatrixColumnVisitor.h>
-#include <dplyr/DataFrameColumnVisitor.h>
-#include <dplyr/subset_visitor.h>
-#include <dplyr/visitor.h>
-#include <dplyr/OrderVisitorImpl.h>
-#include <dplyr/JoinVisitor.h>
-#include <dplyr/JoinVisitorImpl.h>
-#include <dplyr/DataFrameJoinVisitors.h>
-#include <dplyr/Order.h>
-#include <dplyr/SummarisedVariable.h>
-#include <dplyr/Result/all.h>
-#include <dplyr/vector_class.h>
-#include <dplyr/Gatherer.h>
-#include <dplyr/Replicator.h>
-#include <dplyr/Collecter.h>
-#include <dplyr/NamedListAccumulator.h>
-#include <dplyr/train.h>
-
-void check_not_groups(const CharacterVector& result_names, const GroupedDataFrame& gdf) ;
-void check_not_groups(const CharacterVector& result_names, const RowwiseDataFrame& gdf) ;
-
-void check_not_groups(const LazyDots& dots, const GroupedDataFrame& gdf) ;
-void check_not_groups(const LazyDots& dots, const RowwiseDataFrame& gdf) ;
-
-template <typename Data>
-SEXP strip_group_attributes(Data df){
- Shield<SEXP> attribs( Rf_cons( dplyr::classes_not_grouped(), R_NilValue ) ) ;
- SET_TAG(attribs, Rf_install("class") ) ;
-
- SEXP p = ATTRIB(df) ;
- std::vector<SEXP> black_list(8) ;
- black_list[0] = Rf_install("indices") ;
- black_list[1] = Rf_install("vars") ;
- black_list[2] = Rf_install("index") ;
- black_list[3] = Rf_install("labels") ;
- black_list[4] = Rf_install("drop") ;
- black_list[5] = Rf_install("group_sizes") ;
- black_list[6] = Rf_install("biggest_group_size") ;
- black_list[7] = Rf_install("class") ;
-
- SEXP q = attribs ;
- while( ! Rf_isNull(p) ){
- SEXP tag = TAG(p) ;
- if( std::find( black_list.begin(), black_list.end(), tag ) == black_list.end() ){
- Shield<SEXP> s( Rf_cons( CAR(p), R_NilValue) ) ;
- SETCDR(q,s) ;
- q = CDR(q) ;
- SET_TAG(q, tag) ;
- }
-
- p = CDR(p) ;
- }
- return attribs ;
-}
-
-template <typename T>
-CharacterVector names( const T& obj ){
- SEXP x = obj ;
- return Rf_getAttrib(x, Rf_install("names" ) ) ;
-}
-
+#include <dplyr/dplyr.h>
#endif
diff --git a/inst/include/dplyr/BoolResult.h b/inst/include/dplyr/BoolResult.h
index 0497bb8..fc823af 100644
--- a/inst/include/dplyr/BoolResult.h
+++ b/inst/include/dplyr/BoolResult.h
@@ -1,43 +1,55 @@
#ifndef dplyr_tools_BoolResult_H
#define dplyr_tools_BoolResult_H
-namespace dplyr{
-
- class BoolResult {
- public:
- BoolResult(bool result_) : result(result_){}
- BoolResult(bool result_, const std::string& msg) : result(result_), message(msg){}
-
- void set_true(){ result = true ; message.clear() ; }
- void set_false( const char* msg ){ result = false; message = msg ; }
-
- inline operator SEXP() const {
- LogicalVector res = LogicalVector::create( result ) ;
- res.attr("comment") = message ;
- res.attr("class") = "BoolResult" ;
- return res;
- }
-
- inline operator bool() const {
- return result ;
- }
-
- inline const std::string& why_not() const {
- return message ;
- }
-
- private:
- bool result ;
- std::string message ;
- } ;
-
- inline BoolResult no_because( const std::string& msg ){
- return BoolResult( false, msg );
+#include <tools/utils.h>
+
+namespace dplyr {
+
+class BoolResult {
+public:
+ BoolResult(bool result_) : result(result_) {}
+ BoolResult(bool result_, const CharacterVector& msg) : result(result_), message(msg) {}
+
+ inline operator SEXP() const {
+ LogicalVector res = LogicalVector::create(result);
+ res.attr("comment") = message;
+ set_class(res, "BoolResult");
+ return res;
+ }
+
+ inline operator bool() const {
+ return result;
+ }
+
+ inline std::string why_not() const {
+ R_xlen_t n = message.length();
+ if (n == 0)
+ return "";
+
+ if (n == 1)
+ return std::string(message[0]);
+
+ std::stringstream ss;
+ ss << "\n";
+ for (int i = 0; i < n; ++i) {
+ ss << "- " << std::string(message[i]) << "\n";
}
- inline BoolResult yes(){
- return true ;
- }
+ return ss.str();
+ }
+
+private:
+ bool result;
+ CharacterVector message;
+};
+
+inline BoolResult no_because(const CharacterVector& msg) {
+ return BoolResult(false, msg);
+}
+
+inline BoolResult yes() {
+ return true;
+}
}
diff --git a/inst/include/dplyr/CharacterVectorOrderer.h b/inst/include/dplyr/CharacterVectorOrderer.h
index 5bee91d..d6f883c 100644
--- a/inst/include/dplyr/CharacterVectorOrderer.h
+++ b/inst/include/dplyr/CharacterVectorOrderer.h
@@ -1,22 +1,22 @@
#ifndef dplyr_CharacterVectorOrderer_H
#define dplyr_CharacterVectorOrderer_H
+#include <tools/hash.h>
+
namespace dplyr {
- class CharacterVectorOrderer {
- public:
+class CharacterVectorOrderer {
+public:
- CharacterVectorOrderer( const CharacterVector& data_ ) ;
+ CharacterVectorOrderer(const CharacterVector& data_);
- inline IntegerVector get() const {
- return orders ;
- }
+ inline IntegerVector get() const {
+ return orders;
+ }
- private:
- CharacterVector data ;
- dplyr_hash_set<SEXP> set ;
- IntegerVector orders ;
- } ;
+private:
+ IntegerVector orders;
+};
}
diff --git a/inst/include/dplyr/Collecter.h b/inst/include/dplyr/Collecter.h
index 219c088..0839e15 100644
--- a/inst/include/dplyr/Collecter.h
+++ b/inst/include/dplyr/Collecter.h
@@ -1,419 +1,672 @@
#ifndef dplyr_Collecter_H
#define dplyr_Collecter_H
-namespace dplyr {
+#include <tools/all_na.h>
+#include <tools/hash.h>
- class Collecter {
- public:
- virtual ~Collecter(){} ;
- virtual void collect( const SlicingIndex& index, SEXP v ) = 0 ;
- virtual SEXP get() = 0 ;
- virtual bool compatible(SEXP) = 0 ;
- virtual bool can_promote(SEXP) const = 0 ;
- virtual bool is_factor_collecter() const{
- return false ;
- }
- virtual bool is_logical_all_na() const {
- return false ;
- }
- virtual std::string describe() const = 0 ;
- } ;
-
- template <int RTYPE>
- class Collecter_Impl : public Collecter {
- public:
- typedef typename Rcpp::traits::storage_type<RTYPE>::type STORAGE ;
-
- Collecter_Impl( int n_ ): data( n_, Rcpp::traits::get_na<RTYPE>() ){}
-
- void collect( const SlicingIndex& index, SEXP v ){
- Vector<RTYPE> source(v) ;
- STORAGE* source_ptr = Rcpp::internal::r_vector_start<RTYPE>(source) ;
- for( int i=0; i<index.size(); i++){
- data[index[i]] = source_ptr[i] ;
- }
- }
+#include <dplyr/registration.h>
+#include <dplyr/vector_class.h>
- inline SEXP get(){
- return data ;
- }
-
- inline bool compatible(SEXP x) {
- return RTYPE == TYPEOF(x) ;
- }
-
- bool can_promote(SEXP x) const {
- return false ;
- }
-
- std::string describe() const {
- return vector_class<RTYPE>() ;
- }
-
- bool is_logical_all_na() const {
- return RTYPE == LGLSXP && all(is_na(data)).is_true() ;
- }
-
- protected:
- Vector<RTYPE> data ;
- } ;
-
- template <>
- class Collecter_Impl<REALSXP> : public Collecter {
- public:
- Collecter_Impl( int n_ ): data( n_, NA_REAL ){}
-
- void collect( const SlicingIndex& index, SEXP v ){
- NumericVector source(v) ;
- double* source_ptr = source.begin() ;
- for( int i=0; i<index.size(); i++){
- data[index[i]] = source_ptr[i] ;
- }
- }
-
- inline SEXP get(){
- return data ;
- }
-
- inline bool compatible(SEXP x) {
- int RTYPE = TYPEOF(x) ;
- return RTYPE == REALSXP || ( RTYPE == INTSXP && !Rf_inherits(x, "factor") ) || RTYPE == LGLSXP ;
- }
-
- bool can_promote(SEXP x) const {
- return false ;
- }
-
- std::string describe() const {
- return "numeric" ;
- }
-
- protected:
- NumericVector data ;
-
- } ;
-
- template <>
- class Collecter_Impl<STRSXP> : public Collecter {
- public:
- Collecter_Impl( int n_ ): data( n_, NA_STRING ){}
-
- void collect( const SlicingIndex& index, SEXP v ){
- if( TYPEOF(v) == STRSXP ){
- collect_strings(index, v) ;
- } else if( Rf_inherits( v, "factor" ) ){
- collect_factor(index, v) ;
- } else {
- CharacterVector vec(v) ;
- collect_strings(index, vec) ;
- }
- }
-
- inline SEXP get(){
- return data ;
- }
-
- inline bool compatible(SEXP x) {
- return ( STRSXP == TYPEOF(x) ) || Rf_inherits( x, "factor" ) ;
- }
-
- bool can_promote(SEXP x) const {
- return false ;
- }
-
- std::string describe() const {
- return "character" ;
- }
-
- protected:
- CharacterVector data ;
-
- private:
-
- void collect_strings( const SlicingIndex& index, CharacterVector source){
- SEXP* p_source = Rcpp::internal::r_vector_start<STRSXP>(source) ;
- SEXP* p_data = Rcpp::internal::r_vector_start<STRSXP>(data) ;
- int n = index.size() ;
- for( int i=0; i<n; i++){
- p_data[index[i]] = p_source[i] ;
- }
- }
-
- void collect_factor( const SlicingIndex& index, IntegerVector source ){
- CharacterVector levels = source.attr("levels") ;
- for( int i=0; i<index.size(); i++){
- if( source[i] == NA_INTEGER ) {
- data[index[i]] = NA_STRING ;
- } else{
- data[index[i]] = levels[source[i]-1] ;
- }
- }
- }
-
- } ;
-
- template <>
- class Collecter_Impl<INTSXP> : public Collecter {
- public:
- Collecter_Impl( int n_ ): data( n_, NA_INTEGER ){}
-
- void collect( const SlicingIndex& index, SEXP v ){
- IntegerVector source(v) ;
- int* source_ptr = source.begin() ;
- for( int i=0; i<index.size(); i++){
- data[index[i]] = source_ptr[i] ;
- }
- }
+namespace dplyr {
- inline SEXP get(){
- return data ;
- }
+static inline bool inherits_from(SEXP x, const std::set<std::string>& classes) {
+ std::vector<std::string> x_classes, inherited_classes;
+ if (!OBJECT(x)) {
+ return false;
+ }
+ x_classes = Rcpp::as< std::vector<std::string> >(Rf_getAttrib(x, R_ClassSymbol));
+ std::sort(x_classes.begin(), x_classes.end());
+ std::set_intersection(x_classes.begin(), x_classes.end(),
+ classes.begin(), classes.end(),
+ std::back_inserter(inherited_classes));
+ return !inherited_classes.empty();
+}
- inline bool compatible(SEXP x) {
- int RTYPE = TYPEOF(x) ;
- return ( INTSXP == RTYPE || RTYPE == LGLSXP ) && !Rf_inherits( x, "factor" ) ;
- }
+static bool is_class_known(SEXP x) {
+ static std::set<std::string> known_classes;
+ if (known_classes.empty()) {
+ known_classes.insert("hms");
+ known_classes.insert("difftime");
+ known_classes.insert("POSIXct");
+ known_classes.insert("factor");
+ known_classes.insert("Date");
+ known_classes.insert("AsIs");
+ known_classes.insert("integer64");
+ known_classes.insert("table");
+ }
+ if (OBJECT(x)) {
+ return inherits_from(x, known_classes);
+ } else {
+ return true;
+ }
+}
- bool can_promote(SEXP x) const {
- return TYPEOF(x) == REALSXP ;
- }
+static inline void warn_loss_attr(SEXP x) {
+ /* Attributes are lost with unknown classes */
+ if (!is_class_known(x)) {
+ SEXP classes = Rf_getAttrib(x, R_ClassSymbol);
+ Rf_warning("Vectorizing '%s' elements may not preserve their attributes",
+ CHAR(STRING_ELT(classes, 0)));
+ }
+}
- std::string describe() const {
- return "integer" ;
- }
+static inline bool all_logical_na(SEXP x, SEXPTYPE xtype) {
+ return LGLSXP == xtype && all_na(x);
+}
- protected:
- IntegerVector data ;
+class Collecter {
+public:
+ virtual ~Collecter() {};
+ virtual void collect(const SlicingIndex& index, SEXP v, int offset = 0) = 0;
+ virtual SEXP get() = 0;
+ virtual bool compatible(SEXP) = 0;
+ virtual bool can_promote(SEXP) const = 0;
+ virtual bool is_factor_collecter() const {
+ return false;
+ }
+ virtual bool is_logical_all_na() const {
+ return false;
+ }
+ virtual std::string describe() const = 0;
+};
+
+template <int RTYPE>
+class Collecter_Impl : public Collecter {
+public:
+ typedef typename Rcpp::traits::storage_type<RTYPE>::type STORAGE;
+
+ Collecter_Impl(int n_): data(n_, Rcpp::traits::get_na<RTYPE>()) {}
+
+ void collect(const SlicingIndex& index, SEXP v, int offset = 0) {
+ if (all_logical_na(v, TYPEOF(v))) {
+ collect_logicalNA(index);
+ } else {
+ collect_sexp(index, v, offset);
+ }
+ }
- } ;
+ inline SEXP get() {
+ return data;
+ }
- template <int RTYPE>
- class TypedCollecter : public Collecter_Impl<RTYPE>{
- public:
- TypedCollecter( int n, SEXP types_) :
- Collecter_Impl<RTYPE>(n), types(types_){}
+ inline bool compatible(SEXP x) {
+ return RTYPE == TYPEOF(x) || all_logical_na(x, TYPEOF(x));
+ }
- inline SEXP get(){
- Collecter_Impl<RTYPE>::data.attr("class") = types ;
- return Collecter_Impl<RTYPE>::data ;
- }
+ bool can_promote(SEXP) const {
+ return false;
+ }
- inline bool compatible(SEXP x) {
- String type = STRING_ELT(types,0) ;
- return Rf_inherits(x, type.get_cstring() ) ;
- }
+ std::string describe() const {
+ return vector_class<RTYPE>();
+ }
- inline bool can_promote(SEXP x) const {
- return false ;
- }
+ bool is_logical_all_na() const {
+ return all_logical_na(data, RTYPE);
+ }
- std::string describe() const {
- return collapse<STRSXP>(types) ;
- }
+protected:
+ Vector<RTYPE> data;
- private:
- SEXP types ;
- } ;
+private:
+ void collect_logicalNA(const SlicingIndex& index) {
+ for (int i = 0; i < index.size(); i++) {
+ data[index[i]] = Rcpp::traits::get_na<RTYPE>();
+ }
+ }
+
+ void collect_sexp(const SlicingIndex& index, SEXP v, int offset = 0) {
+ warn_loss_attr(v);
+ Vector<RTYPE> source(v);
+ STORAGE* source_ptr = Rcpp::internal::r_vector_start<RTYPE>(source);
+ source_ptr = source_ptr + offset;
+ for (int i = 0; i < index.size(); i++) {
+ data[index[i]] = source_ptr[i];
+ }
+ }
- class POSIXctCollecter : public Collecter_Impl<REALSXP>{
- public:
- typedef Collecter_Impl<REALSXP> Parent ;
+};
- POSIXctCollecter( int n, SEXP tz_) :
- Parent(n), tz(tz_){}
+template <>
+class Collecter_Impl<REALSXP> : public Collecter {
+public:
+ Collecter_Impl(int n_): data(n_, NA_REAL) {}
- void collect( const SlicingIndex& index, SEXP v ){
- Parent::collect(index, v) ;
- update_tz(v) ;
- }
+ void collect(const SlicingIndex& index, SEXP v, int offset = 0) {
+ warn_loss_attr(v);
+ NumericVector source(v);
+ double* source_ptr = source.begin() + offset;
+ for (int i = 0; i < index.size(); i++) {
+ data[index[i]] = source_ptr[i];
+ }
+ }
+
+ inline SEXP get() {
+ return data;
+ }
+
+ inline bool compatible(SEXP x) {
+ int RTYPE = TYPEOF(x);
+ return (RTYPE == REALSXP && !Rf_inherits(x, "POSIXct") && !Rf_inherits(x, "Date")) ||
+ (RTYPE == INTSXP && !Rf_inherits(x, "factor")) ||
+ all_logical_na(x, RTYPE);
+ }
+
+ bool can_promote(SEXP) const {
+ return false;
+ }
+
+ std::string describe() const {
+ return "numeric";
+ }
+
+protected:
+ NumericVector data;
+
+};
+
+template <>
+class Collecter_Impl<STRSXP> : public Collecter {
+public:
+ Collecter_Impl(int n_): data(n_, NA_STRING) {}
+
+ void collect(const SlicingIndex& index, SEXP v, int offset = 0) {
+ warn_loss_attr(v);
+ if (TYPEOF(v) == STRSXP) {
+ collect_strings(index, v, offset);
+ } else if (Rf_inherits(v, "factor")) {
+ collect_factor(index, v, offset);
+ } else if (all_logical_na(v, TYPEOF(v))) {
+ collect_logicalNA(index, v);
+ } else {
+ CharacterVector vec(v);
+ collect_strings(index, vec, offset);
+ }
+ }
- inline SEXP get(){
- Parent::data.attr("class") = get_time_classes() ;
- if( !tz.isNULL() ){
- Parent::data.attr("tzone") = tz ;
- }
- return Parent::data ;
- }
+ inline SEXP get() {
+ return data;
+ }
- inline bool compatible(SEXP x) {
- return Rf_inherits(x, "POSIXct") ;
- }
+ inline bool compatible(SEXP x) {
+ return (STRSXP == TYPEOF(x)) || Rf_inherits(x, "factor") || all_logical_na(x, TYPEOF(x));
+ }
- inline bool can_promote(SEXP x) const {
- return false ;
- }
+ bool can_promote(SEXP) const {
+ return false;
+ }
- std::string describe() const {
- return collapse<STRSXP>(get_time_classes()) ;
- }
+ std::string describe() const {
+ return "character";
+ }
- private:
- RObject tz ;
-
- void update_tz(SEXP v){
- RObject v_tz( Rf_getAttrib(v, Rf_install("tzone")) );
- // if the new tz is NULL, keep previous value
- if( v_tz.isNULL() ) return ;
-
- if( tz.isNULL() ){
- // if current tz is NULL, grab the new one
- tz = v_tz ;
- } else {
- // none are NULL, so compare them
- // if they are equal, fine
- if( STRING_ELT(tz, 0) == STRING_ELT(v_tz,0) ) return ;
-
- // otherwise, settle to UTC
- tz = wrap( "UTC") ;
- }
- }
+protected:
+ CharacterVector data;
- } ;
+private:
- class FactorCollecter : public Collecter {
- public:
- typedef dplyr_hash_map<SEXP,int> LevelsMap ;
+ void collect_logicalNA(const SlicingIndex& index, LogicalVector) {
+ int n = index.size();
+ for (int i = 0; i < n; i++) {
+ SET_STRING_ELT(data, index[i], NA_STRING);
+ }
+ }
+
+ void collect_strings(const SlicingIndex& index, CharacterVector source,
+ int offset = 0) {
+ SEXP* p_source = Rcpp::internal::r_vector_start<STRSXP>(source) + offset;
+ int n = index.size();
+ for (int i = 0; i < n; i++) {
+ SET_STRING_ELT(data, index[i], p_source[i]);
+ }
+ }
+
+ void collect_factor(const SlicingIndex& index, IntegerVector source,
+ int offset = 0) {
+ CharacterVector levels = get_levels(source);
+ Rf_warning("binding character and factor vector, coercing into character vector");
+ for (int i = 0; i < index.size(); i++) {
+ if (source[i] == NA_INTEGER) {
+ data[index[i]] = NA_STRING;
+ } else {
+ data[index[i]] = levels[source[i + offset] - 1];
+ }
+ }
+ }
- FactorCollecter( int n, SEXP model_):
- data(n, IntegerVector::get_na() ),
- model(model_),
- levels( Rf_getAttrib(model, Rf_install("levels")) ),
- levels_map()
- {
- int nlevels = levels.size() ;
- for( int i=0; i<nlevels; i++) levels_map[ levels[i] ] = i + 1;
- }
+};
- bool is_factor_collecter() const{
- return true ;
- }
+template <>
+class Collecter_Impl<INTSXP> : public Collecter {
+public:
+ Collecter_Impl(int n_): data(n_, NA_INTEGER) {}
- void collect( const SlicingIndex& index, SEXP v ){
- // here we can assume that v is a factor with the right levels
- // we however do not assume that they are in the same order
- IntegerVector source(v) ;
- CharacterVector levels = source.attr( "levels" ) ;
-
- SEXP* levels_ptr = Rcpp::internal::r_vector_start<STRSXP>(levels) ;
- int* source_ptr = Rcpp::internal::r_vector_start<INTSXP>(source) ;
- for( int i=0; i<index.size(); i++){
- if( source_ptr[i] == NA_INTEGER ){
- data[ index[i] ] = NA_INTEGER ;
- } else {
- SEXP x = levels_ptr[ source_ptr[i] - 1 ] ;
- data[ index[i] ] = levels_map.find(x)->second ;
- }
- }
- }
+ void collect(const SlicingIndex& index, SEXP v, int offset = 0) {
+ warn_loss_attr(v);
+ IntegerVector source(v);
+ int* source_ptr = source.begin() + offset;
+ for (int i = 0; i < index.size(); i++) {
+ data[index[i]] = source_ptr[i];
+ }
+ }
+
+ inline SEXP get() {
+ return data;
+ }
+
+ inline bool compatible(SEXP x) {
+ int RTYPE = TYPEOF(x);
+ return ((INTSXP == RTYPE) && !Rf_inherits(x, "factor")) || all_logical_na(x, RTYPE);
+ }
+
+ bool can_promote(SEXP x) const {
+ return TYPEOF(x) == REALSXP && !Rf_inherits(x, "POSIXct") && !Rf_inherits(x, "Date");
+ }
+
+ std::string describe() const {
+ return "integer";
+ }
+
+protected:
+ IntegerVector data;
+
+};
+
+template <int RTYPE>
+class TypedCollecter : public Collecter_Impl<RTYPE> {
+public:
+ TypedCollecter(int n, SEXP types_) :
+ Collecter_Impl<RTYPE>(n), types(types_) {}
+
+ inline SEXP get() {
+ Vector<RTYPE> data = Collecter_Impl<RTYPE>::data;
+ set_class(data, types);
+ return data;
+ }
+
+ inline bool compatible(SEXP x) {
+ String type = STRING_ELT(types, 0);
+ return Rf_inherits(x, type.get_cstring()) || all_logical_na(x, TYPEOF(x));
+ }
+
+ inline bool can_promote(SEXP) const {
+ return false;
+ }
+
+ std::string describe() const {
+ return collapse_utf8<STRSXP>(types);
+ }
+
+private:
+ SEXP types;
+};
+
+class POSIXctCollecter : public Collecter_Impl<REALSXP> {
+public:
+ typedef Collecter_Impl<REALSXP> Parent;
+
+ POSIXctCollecter(int n, SEXP tz_) :
+ Parent(n), tz(tz_) {}
+
+ void collect(const SlicingIndex& index, SEXP v, int offset = 0) {
+ if (Rf_inherits(v, "POSIXct")) {
+ Parent::collect(index, v, offset);
+ update_tz(v);
+ } else if (all_logical_na(v, TYPEOF(v))) {
+ Parent::collect(index, v, offset);
+ }
+ }
- inline SEXP get() {
- data.attr( "levels" ) = levels ;
- data.attr( "class" ) = model.attr("class") ;
- return data ;
- }
+ inline SEXP get() {
+ set_class(data, get_time_classes());
+ if (!tz.isNULL()) {
+ Parent::data.attr("tzone") = tz;
+ }
+ return Parent::data;
+ }
+
+ inline bool compatible(SEXP x) {
+ return Rf_inherits(x, "POSIXct") || all_logical_na(x, TYPEOF(x));
+ }
+
+ inline bool can_promote(SEXP) const {
+ return false;
+ }
+
+ std::string describe() const {
+ return collapse_utf8<STRSXP>(get_time_classes());
+ }
+
+private:
+ void update_tz(SEXP v) {
+ RObject v_tz(Rf_getAttrib(v, Rf_install("tzone")));
+ // if the new tz is NULL, keep previous value
+ if (v_tz.isNULL()) return;
+
+ if (tz.isNULL()) {
+ // if current tz is NULL, grab the new one
+ tz = v_tz;
+ } else {
+ // none are NULL, so compare them
+ // if they are equal, fine
+ if (STRING_ELT(tz, 0) == STRING_ELT(v_tz, 0)) return;
+
+ // otherwise, settle to UTC
+ tz = wrap("UTC");
+ }
+ }
- inline bool compatible(SEXP x) {
- return Rf_inherits( x, "factor" ) && has_same_levels_as(x) ;
- }
+ RObject tz;
+};
- inline bool can_promote(SEXP x) const {
- return TYPEOF(x) == STRSXP || Rf_inherits( x, "factor" ) ;
- }
+class DifftimeCollecter : public Collecter_Impl<REALSXP> {
+public:
+ typedef Collecter_Impl<REALSXP> Parent;
- inline bool has_same_levels_as( SEXP x) const {
- CharacterVector levels_other = Rf_getAttrib( x, Rf_install( "levels" ) ) ;
+ DifftimeCollecter(int n, std::string units_, SEXP types_) :
+ Parent(n), units(units_), types(types_) {}
- int nlevels = levels_other.size() ;
- if( nlevels != (int)levels_map.size() ) return false ;
+ void collect(const SlicingIndex& index, SEXP v, int offset = 0) {
+ if (Rf_inherits(v, "difftime")) {
+ collect_difftime(index, v, offset);
+ } else if (all_logical_na(v, TYPEOF(v))) {
+ Parent::collect(index, v, offset);
+ }
+ }
+
+ inline SEXP get() {
+ set_class(Parent::data, types);
+ Parent::data.attr("units") = wrap(units);
+ return Parent::data;
+ }
+
+ inline bool compatible(SEXP x) {
+ return Rf_inherits(x, "difftime") || all_logical_na(x, TYPEOF(x));
+ }
+
+ inline bool can_promote(SEXP) const {
+ return false;
+ }
+
+ std::string describe() const {
+ return collapse_utf8<STRSXP>(types);
+ }
+
+private:
+ bool is_valid_difftime(RObject x) {
+ return
+ x.inherits("difftime") &&
+ x.sexp_type() == REALSXP &&
+ get_units_map().is_valid_difftime_unit(Rcpp::as<std::string>(x.attr("units")));
+ }
+
+
+ void collect_difftime(const SlicingIndex& index, RObject v, int offset = 0) {
+ if (!is_valid_difftime(v)) {
+ stop("Invalid difftime object");
+ }
+ std::string v_units = Rcpp::as<std::string>(v.attr("units"));
+ if (!get_units_map().is_valid_difftime_unit(units)) {
+ // if current unit is NULL, grab the new one
+ units = v_units;
+ // then collect the data:
+ Parent::collect(index, v, offset);
+ } else {
+ // We had already defined the units.
+ // Does the new vector have the same units?
+ if (units == v_units) {
+ Parent::collect(index, v, offset);
+ } else {
+ // If units are different convert the existing data and the new vector
+ // to seconds (following the convention on
+ // r-source/src/library/base/R/datetime.R)
+ double factor_data = get_units_map().time_conversion_factor(units);
+ if (factor_data != 1.0) {
+ for (int i = 0; i < Parent::data.size(); i++) {
+ Parent::data[i] = factor_data * Parent::data[i];
+ }
+ }
+ units = "secs";
+ double factor_v = get_units_map().time_conversion_factor(v_units);
+ if (Rf_length(v) < index.size()) {
+ stop("Wrong size of vector to collect");
+ }
+ for (int i = 0; i < index.size(); i++) {
+ Parent::data[index[i]] = factor_v * (REAL(v)[i + offset]);
+ }
+ }
+ }
+ }
+
+ class UnitsMap {
+ typedef std::map<std::string, double> units_map;
+ const units_map valid_units;
+
+ static units_map create_valid_units() {
+ units_map valid_units;
+ double factor = 1;
+
+ // Acceptable units based on r-source/src/library/base/R/datetime.R
+ valid_units.insert(std::make_pair("secs", factor));
+ factor *= 60;
+ valid_units.insert(std::make_pair("mins", factor));
+ factor *= 60;
+ valid_units.insert(std::make_pair("hours", factor));
+ factor *= 24;
+ valid_units.insert(std::make_pair("days", factor));
+ factor *= 7;
+ valid_units.insert(std::make_pair("weeks", factor));
+ return valid_units;
+ }
- for( int i=0; i<nlevels; i++)
- if( ! levels_map.count(levels_other[i]) )
- return false ;
- return true ;
- }
+ public:
+ UnitsMap() : valid_units(create_valid_units()) {}
- inline std::string describe() const {
- return "factor" ;
- }
+ bool is_valid_difftime_unit(const std::string& x_units) const {
+ return (valid_units.find(x_units) != valid_units.end());
+ }
- private:
- IntegerVector data ;
- RObject model ;
- CharacterVector levels ;
- LevelsMap levels_map ;
- } ;
+ double time_conversion_factor(const std::string& v_units) const {
+ units_map::const_iterator it = valid_units.find(v_units);
+ if (it == valid_units.end()) {
+ stop("Invalid difftime units (%s).", v_units.c_str());
+ }
- template <>
- inline bool Collecter_Impl<LGLSXP>::can_promote( SEXP x) const {
- return ( TYPEOF(x) == INTSXP && ! Rf_inherits(x, "factor" ) ) || TYPEOF(x) == REALSXP ;
+ return it->second;
}
+ };
+
+ static const UnitsMap& get_units_map() {
+ static UnitsMap map;
+ return map;
+ }
+
+private:
+ std::string units;
+ SEXP types;
+
+};
+
+
+class FactorCollecter : public Collecter {
+public:
+ typedef dplyr_hash_map<SEXP, int> LevelsMap;
+
+ FactorCollecter(int n, SEXP model_):
+ data(n, IntegerVector::get_na()),
+ model(model_),
+ levels(get_levels(model_)),
+ levels_map()
+ {
+ int nlevels = levels.size();
+ for (int i = 0; i < nlevels; i++) levels_map[ levels[i] ] = i + 1;
+ }
+
+ bool is_factor_collecter() const {
+ return true;
+ }
+
+ void collect(const SlicingIndex& index, SEXP v, int offset = 0) {
+ if (offset != 0) stop("Nonzero offset not supported by FactorCollecter");
+ if (Rf_inherits(v, "factor") && has_same_levels_as(v)) {
+ collect_factor(index, v);
+ } else if (all_logical_na(v, TYPEOF(v))) {
+ collect_logicalNA(index);
+ }
+ }
+
+ inline SEXP get() {
+ set_levels(data, levels);
+ set_class(data, get_class(model));
+ return data;
+ }
+
+ inline bool compatible(SEXP x) {
+ return ((Rf_inherits(x, "factor") && has_same_levels_as(x)) ||
+ all_logical_na(x, TYPEOF(x)));
+ }
+
+ inline bool can_promote(SEXP x) const {
+ return TYPEOF(x) == STRSXP || Rf_inherits(x, "factor");
+ }
+
+ inline bool has_same_levels_as(SEXP x) const {
+ CharacterVector levels_other = get_levels(x);
+
+ int nlevels = levels_other.size();
+ if (nlevels != (int)levels_map.size()) return false;
+
+ for (int i = 0; i < nlevels; i++)
+ if (! levels_map.count(levels_other[i]))
+ return false;
+ return true;
+ }
+
+ inline std::string describe() const {
+ return "factor";
+ }
+
+private:
+ IntegerVector data;
+ RObject model;
+ CharacterVector levels;
+ LevelsMap levels_map;
+
+ void collect_factor(const SlicingIndex& index, SEXP v) {
+ // here we can assume that v is a factor with the right levels
+ // we however do not assume that they are in the same order
+ IntegerVector source(v);
+ CharacterVector levels = get_levels(source);
+ SEXP* levels_ptr = Rcpp::internal::r_vector_start<STRSXP>(levels);
+ int* source_ptr = Rcpp::internal::r_vector_start<INTSXP>(source);
+ for (int i = 0; i < index.size(); i++) {
+ if (source_ptr[i] == NA_INTEGER) {
+ data[ index[i] ] = NA_INTEGER;
+ } else {
+ SEXP x = levels_ptr[ source_ptr[i] - 1 ];
+ data[ index[i] ] = levels_map.find(x)->second;
+ }
+ }
+ }
- inline Collecter* collecter(SEXP model, int n){
- switch( TYPEOF(model) ){
- case INTSXP:
- if( Rf_inherits( model, "POSIXct" ) )
- return new POSIXctCollecter(n, Rf_getAttrib(model, Rf_install("tzone") ) ) ;
- if( Rf_inherits(model, "factor") )
- return new FactorCollecter(n, model ) ;
- if( Rf_inherits(model, "Date") )
- return new TypedCollecter<INTSXP>(n, get_date_classes()) ;
- return new Collecter_Impl<INTSXP>(n) ;
- case REALSXP:
- if( Rf_inherits( model, "POSIXct" ) )
- return new POSIXctCollecter(n, Rf_getAttrib(model, Rf_install("tzone") ) ) ;
- if( Rf_inherits( model, "Date" ) )
- return new TypedCollecter<REALSXP>(n, get_date_classes()) ;
- return new Collecter_Impl<REALSXP>(n) ;
- case CPLXSXP:
- return new Collecter_Impl<CPLXSXP>(n) ;
- case LGLSXP: return new Collecter_Impl<LGLSXP>(n) ;
- case STRSXP: return new Collecter_Impl<STRSXP>(n) ;
- case VECSXP:
- if( Rf_inherits( model, "POSIXlt" )) {
- stop( "POSIXlt not supported" ) ;
- }
- return new Collecter_Impl<VECSXP>(n) ;
- default: break ;
- }
-
- stop("Unsupported vector type %s", Rf_type2char(TYPEOF(model))) ;
- return 0 ;
+ void collect_logicalNA(const SlicingIndex& index) {
+ for (int i = 0; i < index.size(); i++) {
+ data[ index[i] ] = NA_INTEGER;
}
+ }
+};
- inline Collecter* promote_collecter(SEXP model, int n, Collecter* previous){
- // handle the case where the previous collecter was a
- // Factor collecter and model is a factor. when this occurs, we need to
- // return a Collecter_Impl<STRSXP> because the factors don't have the
- // same levels
- if( Rf_inherits( model, "factor" ) && previous->is_factor_collecter() ){
- Rf_warning( "Unequal factor levels: coercing to character" ) ;
- return new Collecter_Impl<STRSXP>(n) ;
- }
+template <>
+inline bool Collecter_Impl<LGLSXP>::can_promote(SEXP) const {
+ return is_logical_all_na();
+}
- switch( TYPEOF(model) ){
- case INTSXP:
- if( Rf_inherits( model, "Date" ) )
- return new TypedCollecter<INTSXP>(n, get_date_classes() ) ;
- if( Rf_inherits(model, "factor") )
- return new Collecter_Impl<STRSXP>(n) ;
- return new Collecter_Impl<INTSXP>(n) ;
- case REALSXP:
- if( Rf_inherits( model, "POSIXct" ) )
- return new POSIXctCollecter(n, Rf_getAttrib(model, Rf_install("tzone") ) ) ;
- if( Rf_inherits( model, "Date" ) )
- return new TypedCollecter<REALSXP>(n, get_date_classes() ) ;
- return new Collecter_Impl<REALSXP>(n) ;
- case LGLSXP: return new Collecter_Impl<LGLSXP>(n) ;
- case STRSXP:
- if( previous->is_factor_collecter() )
- Rf_warning("binding factor and character vector, coercing into character vector") ;
- return new Collecter_Impl<STRSXP>(n) ;
- default: break ;
- }
- stop("Unsupported vector type %s", Rf_type2char(TYPEOF(model))) ;
- return 0 ;
+inline Collecter* collecter(SEXP model, int n) {
+ switch (TYPEOF(model)) {
+ case INTSXP:
+ if (Rf_inherits(model, "POSIXct"))
+ return new POSIXctCollecter(n, Rf_getAttrib(model, Rf_install("tzone")));
+ if (Rf_inherits(model, "factor"))
+ return new FactorCollecter(n, model);
+ if (Rf_inherits(model, "Date"))
+ return new TypedCollecter<INTSXP>(n, get_date_classes());
+ return new Collecter_Impl<INTSXP>(n);
+ case REALSXP:
+ if (Rf_inherits(model, "POSIXct"))
+ return new POSIXctCollecter(n, Rf_getAttrib(model, Rf_install("tzone")));
+ if (Rf_inherits(model, "difftime"))
+ return
+ new DifftimeCollecter(
+ n,
+ Rcpp::as<std::string>(Rf_getAttrib(model, Rf_install("units"))),
+ Rf_getAttrib(model, R_ClassSymbol));
+ if (Rf_inherits(model, "Date"))
+ return new TypedCollecter<REALSXP>(n, get_date_classes());
+ if (Rf_inherits(model, "integer64"))
+ return new TypedCollecter<REALSXP>(n, CharacterVector::create("integer64"));
+ return new Collecter_Impl<REALSXP>(n);
+ case CPLXSXP:
+ return new Collecter_Impl<CPLXSXP>(n);
+ case LGLSXP:
+ return new Collecter_Impl<LGLSXP>(n);
+ case STRSXP:
+ return new Collecter_Impl<STRSXP>(n);
+ case VECSXP:
+ if (Rf_inherits(model, "POSIXlt")) {
+ stop("POSIXlt not supported");
+ }
+ if (Rf_inherits(model, "data.frame")) {
+ stop("Columns of class data.frame not supported");
}
+ return new Collecter_Impl<VECSXP>(n);
+ default:
+ break;
+ }
+ stop("is of unsupported type %s", Rf_type2char(TYPEOF(model)));
+}
+
+inline Collecter* promote_collecter(SEXP model, int n, Collecter* previous) {
+ // handle the case where the previous collecter was a
+ // Factor collecter and model is a factor. when this occurs, we need to
+ // return a Collecter_Impl<STRSXP> because the factors don't have the
+ // same levels
+ if (Rf_inherits(model, "factor") && previous->is_factor_collecter()) {
+ Rf_warning("Unequal factor levels: coercing to character");
+ return new Collecter_Impl<STRSXP>(n);
+ }
+
+ // logical NA can be promoted to whatever type comes next
+ if (previous->is_logical_all_na()) {
+ return collecter(model, n);
+ }
+
+ switch (TYPEOF(model)) {
+ case INTSXP:
+ if (Rf_inherits(model, "Date"))
+ return new TypedCollecter<INTSXP>(n, get_date_classes());
+ if (Rf_inherits(model, "factor"))
+ return new Collecter_Impl<STRSXP>(n);
+ return new Collecter_Impl<INTSXP>(n);
+ case REALSXP:
+ if (Rf_inherits(model, "POSIXct"))
+ return new POSIXctCollecter(n, Rf_getAttrib(model, Rf_install("tzone")));
+ if (Rf_inherits(model, "Date"))
+ return new TypedCollecter<REALSXP>(n, get_date_classes());
+ if (Rf_inherits(model, "integer64"))
+ return new TypedCollecter<REALSXP>(n, CharacterVector::create("integer64"));
+ return new Collecter_Impl<REALSXP>(n);
+ case LGLSXP:
+ return new Collecter_Impl<LGLSXP>(n);
+ case STRSXP:
+ if (previous->is_factor_collecter())
+ Rf_warning("binding factor and character vector, coercing into character vector");
+ return new Collecter_Impl<STRSXP>(n);
+ default:
+ break;
+ }
+ stop("is of unsupported type %s", Rf_type2char(TYPEOF(model)));
+}
}
diff --git a/inst/include/dplyr/Column.h b/inst/include/dplyr/Column.h
new file mode 100644
index 0000000..5dcbf00
--- /dev/null
+++ b/inst/include/dplyr/Column.h
@@ -0,0 +1,26 @@
+#ifndef DPLYR_DPLYR_COLUMN_H
+#define DPLYR_DPLYR_COLUMN_H
+
+class Column {
+public:
+ Column(SEXP data_, const SymbolString& name_) : data(data_), name(name_) {}
+
+public:
+ const RObject& get_data() const {
+ return data;
+ }
+
+ const SymbolString& get_name() const {
+ return name;
+ }
+
+ Column update_data(SEXP new_data) const {
+ return Column(new_data, name);
+ }
+
+private:
+ RObject data;
+ SymbolString name;
+};
+
+#endif //DPLYR_DPLYR_COLUMN_H
diff --git a/inst/include/dplyr/DataFrameAble.h b/inst/include/dplyr/DataFrameAble.h
deleted file mode 100644
index 12a1be1..0000000
--- a/inst/include/dplyr/DataFrameAble.h
+++ /dev/null
@@ -1,150 +0,0 @@
-#ifndef dplyr_DataFrameAble_H
-#define dplyr_DataFrameAble_H
-
-namespace dplyr {
-
- class DataFrameAbleImpl {
- public:
- virtual ~DataFrameAbleImpl(){} ;
- virtual int nrows() const = 0 ;
- virtual SEXP get( int i ) const = 0 ;
- virtual int size() const = 0 ;
- virtual CharacterVector names() const = 0 ;
- virtual bool is_dataframe() const = 0 ;
- virtual SEXP get() const = 0 ;
- } ;
-
- class DataFrameAble_DataFrame : public DataFrameAbleImpl {
- public:
- DataFrameAble_DataFrame( DataFrame data_) : data(data_){
- if( data.size() ){
- CharacterVector df_names = data.names() ;
- if( any(is_na(df_names)).is_true() ){
- stop( "corrupt data frame" ) ;
- }
- }
- }
-
- inline int nrows() const {
- return data.nrows() ;
- }
-
- inline SEXP get(int i) const {
- return data[i] ;
- }
-
- inline int size() const {
- return data.size() ;
- }
-
- inline CharacterVector names() const {
- return data.names() ;
- }
-
- inline bool is_dataframe() const {
- return true ;
- }
-
- inline SEXP get() const {
- return data ;
- }
-
- private:
- DataFrame data ;
- } ;
-
- class DataFrameAble_List : public DataFrameAbleImpl {
- public:
- DataFrameAble_List( SEXP data_) : data(data_), nr(0){
- int n = data.size() ;
- if( data.size() == 0) return ;
- nr = Rf_length(data[0]) ;
- for(int i=1; i<n; i++){
- if( Rf_length(data[i]) != nr ) {
- stop( "incompatible sizes (%d != %s)", nr, Rf_length(data[i]) ) ;
- }
- }
- }
-
- inline int nrows() const {
- return nr ;
- }
-
- inline SEXP get(int i) const {
- return data[i] ;
- }
-
- inline int size() const {
- return data.size() ;
- }
-
- inline CharacterVector names() const {
- return data.names() ;
- }
-
- inline bool is_dataframe() const {
- return false ;
- }
-
- inline SEXP get() const {
- return data ;
- }
-
- private:
- List data ;
- int nr ;
- } ;
-
- class DataFrameAble{
- public:
- DataFrameAble( SEXP data ) {
- init(data) ;
- }
- DataFrameAble( List::Proxy data){
- init( (SEXP)data) ;
- }
-
-
- inline int nrows() const {
- return impl->nrows() ;
- }
-
- inline int size() const {
- return impl->size() ;
- }
-
- inline SEXP get( int i ) const {
- return impl->get(i) ;
- }
-
- inline CharacterVector names() const {
- return impl->names() ;
- }
-
- inline bool is_dataframe() const {
- return impl->is_dataframe() ;
- }
-
- inline SEXP get() const {
- return impl->get() ;
- }
-
- private:
- boost::shared_ptr<DataFrameAbleImpl> impl ;
-
- inline void init( SEXP data){
- if( Rf_inherits( data, "data.frame")){
- impl.reset( new DataFrameAble_DataFrame(data)) ;
- } else if( is<List>(data) ){
- impl.reset( new DataFrameAble_List(data) ) ;
- } else {
- stop( "cannot convert object to a data frame" ) ;
- }
- }
-
- } ;
-
-}
-
-
-#endif
diff --git a/inst/include/dplyr/DataFrameColumnSubsetVisitor.h b/inst/include/dplyr/DataFrameColumnSubsetVisitor.h
index 560cc9c..b863115 100644
--- a/inst/include/dplyr/DataFrameColumnSubsetVisitor.h
+++ b/inst/include/dplyr/DataFrameColumnSubsetVisitor.h
@@ -1,52 +1,50 @@
#ifndef dplyr_DataFrameColumnSubsetVisitors_H
#define dplyr_DataFrameColumnSubsetVisitors_H
-namespace dplyr {
+#include <dplyr/SubsetVectorVisitor.h>
- class DataFrameColumnSubsetVisitor : public SubsetVectorVisitor {
- public:
- DataFrameColumnSubsetVisitor( const DataFrame& data_ ) : data(data_), visitors(data) {}
+namespace dplyr {
- inline SEXP subset( const Rcpp::IntegerVector& index ) const {
- return visitors.subset( index, data.attr("class") ) ;
- }
+class DataFrameColumnSubsetVisitor : public SubsetVectorVisitor {
+public:
+ DataFrameColumnSubsetVisitor(const DataFrame& data_) : data(data_), visitors(data) {}
- inline SEXP subset( const std::vector<int>& index ) const {
- return visitors.subset( index, data.attr("class") ) ;
- }
+ inline SEXP subset(const Rcpp::IntegerVector& index) const {
+ return visitors.subset(index, get_class(data));
+ }
- inline SEXP subset( const SlicingIndex& index ) const {
- return visitors.subset( index, data.attr("class") ) ;
- }
+ inline SEXP subset(const std::vector<int>& index) const {
+ return visitors.subset(index, get_class(data));
+ }
- inline SEXP subset( const ChunkIndexMap& index ) const {
- return visitors.subset( index, data.attr("class") ) ;
- }
+ inline SEXP subset(const SlicingIndex& index) const {
+ return visitors.subset(index, get_class(data));
+ }
- inline SEXP subset( const Rcpp::LogicalVector& index ) const {
- return visitors.subset( index, data.attr("class") ) ;
- }
+ inline SEXP subset(const ChunkIndexMap& index) const {
+ return visitors.subset(index, get_class(data));
+ }
- inline SEXP subset( EmptySubset index ) const {
- return visitors.subset( index, data.attr("class") );
- }
+ inline SEXP subset(EmptySubset index) const {
+ return visitors.subset(index, get_class(data));
+ }
- inline int size() const {
- return visitors.nrows() ;
- }
+ inline int size() const {
+ return visitors.nrows();
+ }
- inline std::string get_r_type() const {
- return "data.frame" ;
- }
+ inline std::string get_r_type() const {
+ return "data.frame";
+ }
- inline bool is_compatible( SubsetVectorVisitor* other, std::stringstream&, const std::string& ) const {
- return true ;
- }
+ inline bool is_compatible(SubsetVectorVisitor* other, std::stringstream&, const SymbolString&) const {
+ return is_same_typeid(other);
+ }
- private:
- DataFrame data ;
- DataFrameSubsetVisitors visitors ;
- } ;
+private:
+ DataFrame data;
+ DataFrameSubsetVisitors visitors;
+};
}
diff --git a/inst/include/dplyr/DataFrameColumnVisitor.h b/inst/include/dplyr/DataFrameColumnVisitor.h
index 136c005..b5d8fe5 100644
--- a/inst/include/dplyr/DataFrameColumnVisitor.h
+++ b/inst/include/dplyr/DataFrameColumnVisitor.h
@@ -1,52 +1,50 @@
#ifndef dplyr_DataFrameColumnVisitors_H
#define dplyr_DataFrameColumnVisitors_H
-namespace dplyr {
+#include <dplyr/DataFrameVisitors.h>
- class DataFrameColumnVisitor : public VectorVisitor {
- public:
- DataFrameColumnVisitor( const DataFrame& data_ ) : data(data_), visitors(data) {}
+namespace dplyr {
- inline size_t hash(int i) const {
- return visitors.hash(i) ;
- }
+class DataFrameColumnVisitor : public VectorVisitor {
+public:
+ DataFrameColumnVisitor(const DataFrame& data_) : data(data_), visitors(data) {}
- inline bool equal(int i, int j) const {
- return visitors.equal(i,j) ;
- }
+ inline size_t hash(int i) const {
+ return visitors.hash(i);
+ }
- inline bool equal_or_both_na(int i, int j) const {
- return visitors.equal_or_both_na(i,j) ;
- }
+ inline bool equal(int i, int j) const {
+ return visitors.equal(i, j);
+ }
- inline bool less( int i, int j ) const {
- return visitors.less(i,j) ;
- }
+ inline bool equal_or_both_na(int i, int j) const {
+ return visitors.equal_or_both_na(i, j);
+ }
- inline bool greater( int i, int j ) const {
- return visitors.greater(i,j) ;
- }
+ inline bool less(int i, int j) const {
+ return visitors.less(i, j);
+ }
- virtual int size() const {
- return visitors.nrows() ;
- }
+ inline bool greater(int i, int j) const {
+ return visitors.greater(i, j);
+ }
- virtual std::string get_r_type() const {
- return "data.frame" ;
- }
+ virtual int size() const {
+ return visitors.nrows();
+ }
- virtual bool is_compatible( VectorVisitor* other, std::stringstream&, const std::string& ) const {
- return true ;
- }
+ virtual std::string get_r_type() const {
+ return "data.frame";
+ }
- bool is_na( int i ) const {
- return false ;
- }
+ bool is_na(int) const {
+ return false;
+ }
- private:
- DataFrame data ;
- DataFrameVisitors visitors ;
- } ;
+private:
+ DataFrame data;
+ DataFrameVisitors visitors;
+};
}
diff --git a/inst/include/dplyr/DataFrameJoinVisitors.h b/inst/include/dplyr/DataFrameJoinVisitors.h
index e67939c..6f85cd4 100644
--- a/inst/include/dplyr/DataFrameJoinVisitors.h
+++ b/inst/include/dplyr/DataFrameJoinVisitors.h
@@ -1,67 +1,76 @@
#ifndef dplyr_DataFrameJoinVisitors_H
#define dplyr_DataFrameJoinVisitors_H
-namespace dplyr{
+#include <tools/pointer_vector.h>
- class DataFrameJoinVisitors :
- public VisitorSetEqual<DataFrameJoinVisitors>,
- public VisitorSetHash<DataFrameJoinVisitors>
- {
- public:
- typedef JoinVisitor visitor_type ;
+#include <dplyr/visitor_set/VisitorSetMixin.h>
- DataFrameJoinVisitors(
- const Rcpp::DataFrame& left_,
- const Rcpp::DataFrame& right_,
- Rcpp::CharacterVector names_left,
- Rcpp::CharacterVector names_right,
- bool warn_
- ) ;
+#include <dplyr/tbl_cpp.h>
+#include <dplyr/JoinVisitor.h>
- inline JoinVisitor* get(int k) const {
- return visitors[k] ;
- }
- inline JoinVisitor* get( String name ) const {
- for( int i=0; i<nvisitors; i++){
- if( name == visitor_names_left[i] ) return get(i) ;
- }
- stop("visitor not found for name '%s' ", name.get_cstring() ) ;
- return 0 ;
- }
- inline int size() const{
- return nvisitors ;
- }
+namespace dplyr {
- template <typename Container>
- inline DataFrame subset( const Container& index, const CharacterVector& classes ){
- int nrows = index.size() ;
- Rcpp::List out(nvisitors);
- for( int k=0; k<nvisitors; k++){
- out[k] = get(k)->subset(index) ;
- }
- out.attr( "class" ) = classes ;
- set_rownames(out, nrows) ;
- out.names() = visitor_names_left ;
- SEXP vars = left.attr( "vars" ) ;
- if( !Rf_isNull(vars) )
- out.attr( "vars" ) = vars ;
- return (SEXP)out ;
- }
+class DataFrameJoinVisitors :
+ public VisitorSetEqual<DataFrameJoinVisitors>,
+ public VisitorSetHash<DataFrameJoinVisitors>
+{
+public:
+ typedef JoinVisitor visitor_type;
- const CharacterVector& left_names() const { return visitor_names_left ;}
- const CharacterVector& right_names() const { return visitor_names_right ;}
+ DataFrameJoinVisitors(
+ const DataFrame& left_,
+ const DataFrame& right_,
+ const SymbolVector& names_left,
+ const SymbolVector& names_right,
+ bool warn_,
+ bool na_match
+ );
- private:
- const DataFrame& left ;
- const DataFrame& right ;
- CharacterVector visitor_names_left ;
- CharacterVector visitor_names_right ;
+ inline JoinVisitor* get(int k) const {
+ return visitors[k];
+ }
+ inline JoinVisitor* get(const SymbolString& name) const {
+ for (int i = 0; i < nvisitors; i++) {
+ if (name == visitor_names_left[i]) return get(i);
+ }
+ stop("visitor not found for name '%s' ", name.get_utf8_cstring());
+ }
+ inline int size() const {
+ return nvisitors;
+ }
- int nvisitors ;
- pointer_vector<JoinVisitor> visitors ;
- bool warn ;
+ template <typename Container>
+ inline DataFrame subset(const Container& index, const CharacterVector& classes) {
+ int nrows = index.size();
+ Rcpp::List out(nvisitors);
+ for (int k = 0; k < nvisitors; k++) {
+ out[k] = get(k)->subset(index);
+ }
+ set_class(out, classes);
+ set_rownames(out, nrows);
+ out.names() = visitor_names_left;
+ copy_vars(out, left);
+ return (SEXP)out;
+ }
- } ;
+ const SymbolVector& left_names() const {
+ return visitor_names_left;
+ }
+ const SymbolVector& right_names() const {
+ return visitor_names_right;
+ }
+
+private:
+ const DataFrame& left;
+ const DataFrame& right;
+ SymbolVector visitor_names_left;
+ SymbolVector visitor_names_right;
+
+ int nvisitors;
+ pointer_vector<JoinVisitor> visitors;
+ bool warn;
+
+};
}
diff --git a/inst/include/dplyr/DataFrameSubsetVisitors.h b/inst/include/dplyr/DataFrameSubsetVisitors.h
index 6d3fd30..d7c2bff 100644
--- a/inst/include/dplyr/DataFrameSubsetVisitors.h
+++ b/inst/include/dplyr/DataFrameSubsetVisitors.h
@@ -1,121 +1,127 @@
#ifndef dplyr_DataFrameSubsetVisitors_H
#define dplyr_DataFrameSubsetVisitors_H
+#include <tools/pointer_vector.h>
+#include <tools/match.h>
+#include <tools/utils.h>
+
+#include <dplyr/tbl_cpp.h>
+#include <dplyr/subset_visitor.h>
+#include <dplyr/bad.h>
+
namespace dplyr {
- class DataFrameSubsetVisitors {
- private:
-
- const Rcpp::DataFrame& data ;
- pointer_vector<SubsetVectorVisitor> visitors ;
- Rcpp::CharacterVector visitor_names ;
- int nvisitors ;
-
- public:
- typedef SubsetVectorVisitor visitor_type ;
-
- DataFrameSubsetVisitors( const Rcpp::DataFrame& data_) :
- data(data_),
- visitors(),
- visitor_names(data.names()),
- nvisitors(visitor_names.size())
- {
-
- for( int i=0; i<nvisitors; i++){
- SubsetVectorVisitor* v = subset_visitor( data[i] ) ;
- visitors.push_back(v) ;
- }
- }
-
- DataFrameSubsetVisitors( const Rcpp::DataFrame& data_, const Rcpp::CharacterVector& names ) :
- data(data_),
- visitors(),
- visitor_names(names),
- nvisitors(visitor_names.size())
- {
-
- std::string name ;
- int n = names.size() ;
- for( int i=0; i<n; i++){
- name = (String)names[i] ;
- SEXP column ;
-
- try{
- column = data[name] ;
- } catch( ... ){
- stop( "unknown column '%s' ", name ) ;
- }
- SubsetVectorVisitor* v = subset_visitor( column ) ;
- visitors.push_back(v) ;
-
- }
-
- }
-
- template <typename Container>
- DataFrame subset_impl( const Container& index, const CharacterVector& classes, traits::false_type ) const {
- List out(nvisitors);
- for( int k=0; k<nvisitors; k++){
- out[k] = get(k)->subset(index) ;
- }
- copy_most_attributes( out, data ) ;
- structure( out, Rf_length(out[0]) , classes) ;
- return out ;
- }
-
- template <typename Container>
- DataFrame subset_impl( const Container& index, const CharacterVector& classes, traits::true_type ) const {
- int n = index.size() ;
- int n_out = std::count( index.begin(), index.end(), TRUE ) ;
- IntegerVector idx = no_init(n_out) ;
- for(int i=0, k=0; i<n; i++){
- if( index[i] == TRUE ){
- idx[k++] = i ;
- }
- }
- return subset_impl( idx, classes, traits::false_type() ) ;
- }
-
- template <typename Container>
- inline DataFrame subset( const Container& index, const CharacterVector& classes ) const {
- return subset_impl( index, classes,
- typename traits::same_type<Container, LogicalVector>::type()
- ) ;
- }
-
- inline int size() const { return nvisitors ; }
- inline SubsetVectorVisitor* get(int k) const { return visitors[k] ; }
-
- Rcpp::String name(int k) const { return visitor_names[k] ; }
-
- inline int nrows() const { return data.nrows() ; }
-
- private:
-
- inline void structure( List& x, int nrows, CharacterVector classes ) const {
- x.attr( "class" ) = classes ;
- set_rownames(x, nrows) ;
- x.names() = visitor_names ;
- SEXP vars = data.attr( "vars" ) ;
- if( !Rf_isNull(vars) )
- x.attr( "vars" ) = vars ;
- }
-
- } ;
-
- inline DataFrame subset( DataFrame data, LogicalVector test, CharacterVector select, CharacterVector classes ){
- DataFrameSubsetVisitors visitors( data, select ) ;
- return visitors.subset(test, classes ) ;
+class DataFrameSubsetVisitors {
+private:
+
+ const Rcpp::DataFrame& data;
+ pointer_vector<SubsetVectorVisitor> visitors;
+ SymbolVector visitor_names;
+ int nvisitors;
+
+public:
+ typedef SubsetVectorVisitor visitor_type;
+
+ DataFrameSubsetVisitors(const Rcpp::DataFrame& data_) :
+ data(data_),
+ visitors(),
+ visitor_names(data.names()),
+ nvisitors(visitor_names.size())
+ {
+ CharacterVector names = data.names();
+ for (int i = 0; i < nvisitors; i++) {
+ SubsetVectorVisitor* v = subset_visitor(data[i], names[i]);
+ visitors.push_back(v);
+ }
+ }
+
+ DataFrameSubsetVisitors(const DataFrame& data_, const SymbolVector& names) :
+ data(data_),
+ visitors(),
+ visitor_names(names),
+ nvisitors(visitor_names.size())
+ {
+
+ CharacterVector data_names = data.names();
+ IntegerVector indx = names.match_in_table(data_names);
+
+ int n = indx.size();
+ for (int i = 0; i < n; i++) {
+
+ int pos = indx[i];
+ if (pos == NA_INTEGER) {
+ bad_col(names[i], "is unknown");
+ }
+
+ SubsetVectorVisitor* v = subset_visitor(data[pos - 1], data_names[pos - 1]);
+ visitors.push_back(v);
+
}
- inline DataFrame subset( DataFrame data, LogicalVector test, CharacterVector classes ){
- DataFrameSubsetVisitors visitors( data ) ;
- DataFrame res = visitors.subset(test, classes ) ;
- return res ;
+ }
+
+ template <typename Container>
+ DataFrame subset(const Container& index, const CharacterVector& classes) const {
+ List out(nvisitors);
+ for (int k = 0; k < nvisitors; k++) {
+ out[k] = get(k)->subset(index);
+ }
+ copy_most_attributes(out, data);
+ structure(out, output_size(index), classes);
+ return out;
+ }
+
+ inline int size() const {
+ return nvisitors;
+ }
+ inline SubsetVectorVisitor* get(int k) const {
+ return visitors[k];
+ }
+
+ const SymbolString name(int k) const {
+ return visitor_names[k];
+ }
+
+ inline int nrows() const {
+ return data.nrows();
+ }
+
+private:
+
+ inline void structure(List& x, int nrows, CharacterVector classes) const {
+ set_class(x, classes);
+ set_rownames(x, nrows);
+ x.names() = visitor_names;
+ copy_vars(x, data);
+ }
+
+};
+
+template <>
+inline DataFrame DataFrameSubsetVisitors::subset(const LogicalVector& index, const CharacterVector& classes) const {
+ const int n = index.size();
+ std::vector<int> idx;
+ idx.reserve(n);
+ for (int i = 0; i < n; i++) {
+ if (index[i] == TRUE) {
+ idx.push_back(i);
}
+ }
+ return subset(idx, classes);
+}
+
+template <typename Index>
+DataFrame subset(DataFrame df, const Index& indices, const SymbolVector& columns, const CharacterVector& classes) {
+ return DataFrameSubsetVisitors(df, columns).subset(indices, classes);
+}
+template <typename Index>
+DataFrame subset(DataFrame df, const Index& indices, CharacterVector classes) {
+ return DataFrameSubsetVisitors(df).subset(indices, classes);
+}
} // namespace dplyr
+#include <dplyr/subset_visitor_impl.h>
#endif
diff --git a/inst/include/dplyr/DataFrameVisitors.h b/inst/include/dplyr/DataFrameVisitors.h
index 6e9442f..1aa56c0 100644
--- a/inst/include/dplyr/DataFrameVisitors.h
+++ b/inst/include/dplyr/DataFrameVisitors.h
@@ -1,40 +1,55 @@
#ifndef dplyr_DataFrameVisitors_H
#define dplyr_DataFrameVisitors_H
+#include <tools/pointer_vector.h>
+
+#include <dplyr/visitor_set/VisitorSetMixin.h>
+
+#include <dplyr/VectorVisitor.h>
+#include <tools/SymbolVector.h>
+
namespace dplyr {
- class DataFrameVisitors :
- public VisitorSetEqual<DataFrameVisitors>,
- public VisitorSetHash<DataFrameVisitors>,
- public VisitorSetLess<DataFrameVisitors>,
- public VisitorSetGreater<DataFrameVisitors> {
+class DataFrameVisitors :
+ public VisitorSetEqual<DataFrameVisitors>,
+ public VisitorSetHash<DataFrameVisitors>,
+ public VisitorSetLess<DataFrameVisitors>,
+ public VisitorSetGreater<DataFrameVisitors> {
- private:
+private:
- const Rcpp::DataFrame& data ;
- pointer_vector<VectorVisitor> visitors ;
- Rcpp::CharacterVector visitor_names ;
- int nvisitors ;
+ const Rcpp::DataFrame& data;
+ pointer_vector<VectorVisitor> visitors;
+ SymbolVector visitor_names;
+ int nvisitors;
- public:
- typedef VectorVisitor visitor_type ;
+public:
+ typedef VectorVisitor visitor_type;
- DataFrameVisitors( const Rcpp::DataFrame& data_) ;
+ DataFrameVisitors(const DataFrame& data_);
- DataFrameVisitors( const Rcpp::DataFrame& data_, const Rcpp::CharacterVector& names ) ;
+ DataFrameVisitors(const DataFrame& data_, const SymbolVector& names);
- inline int size() const { return nvisitors ; }
- inline VectorVisitor* get(int k) const { return visitors[k] ; }
+ inline int size() const {
+ return nvisitors;
+ }
+ inline VectorVisitor* get(int k) const {
+ return visitors[k];
+ }
- Rcpp::String name(int k) const { return visitor_names[k] ; }
+ const SymbolString name(int k) const {
+ return visitor_names[k];
+ }
- inline int nrows() const { return data.nrows() ; }
+ inline int nrows() const {
+ return data.nrows();
+ }
- private:
+private:
- void structure( List& x, int nrows, CharacterVector classes ) const ;
+ void structure(List& x, int nrows, CharacterVector classes) const;
- } ;
+};
} // namespace dplyr
diff --git a/inst/include/dplyr/DataFrameVisitorsIndexMap.h b/inst/include/dplyr/DataFrameVisitorsIndexMap.h
index cd2db01..64c9389 100644
--- a/inst/include/dplyr/DataFrameVisitorsIndexMap.h
+++ b/inst/include/dplyr/DataFrameVisitorsIndexMap.h
@@ -1,9 +1,12 @@
#ifndef dplyr_DataFrameVisitors_map_H
#define dplyr_DataFrameVisitors_map_H
+#include <dplyr/DataFrameVisitors.h>
+#include <dplyr/visitor_set/VisitorSetIndexMap.h>
+
namespace dplyr {
- typedef VisitorSetIndexMap< DataFrameVisitors, std::vector<int> > ChunkIndexMap ;
+typedef VisitorSetIndexMap< DataFrameVisitors, std::vector<int> > ChunkIndexMap;
}
diff --git a/inst/include/dplyr/DataFrameVisitorsIndexSet.h b/inst/include/dplyr/DataFrameVisitorsIndexSet.h
deleted file mode 100644
index e500157..0000000
--- a/inst/include/dplyr/DataFrameVisitorsIndexSet.h
+++ /dev/null
@@ -1,10 +0,0 @@
-#ifndef dplyr_DataFrameVisitors_set_H
-#define dplyr_DataFrameVisitors_set_H
-
-namespace dplyr {
-
- typedef VisitorSetIndexSet<DataFrameVisitors> DataFrameVisitorsIndexSet ;
-
-}
-
-#endif
diff --git a/inst/include/dplyr/EmptySubset.h b/inst/include/dplyr/EmptySubset.h
index aa447fb..4da13a8 100644
--- a/inst/include/dplyr/EmptySubset.h
+++ b/inst/include/dplyr/EmptySubset.h
@@ -2,6 +2,11 @@
#define dplyr_EmptySubset_H
namespace dplyr {
- class EmptySubset{} ;
+class EmptySubset {
+public:
+ int size() const {
+ return 0;
+ }
+};
}
#endif
diff --git a/inst/include/dplyr/FullDataFrame.h b/inst/include/dplyr/FullDataFrame.h
index fa80b02..87d4a84 100644
--- a/inst/include/dplyr/FullDataFrame.h
+++ b/inst/include/dplyr/FullDataFrame.h
@@ -1,23 +1,25 @@
#ifndef dplyr_tools_FullDataFrame_H
#define dplyr_tools_FullDataFrame_H
-namespace Rcpp {
+namespace dplyr {
- class FullDataFrame {
- public:
- FullDataFrame( const DataFrame& data_ ) : index(0, data_.nrows() ) {}
+class FullDataFrame {
+public:
+ typedef NaturalSlicingIndex slicing_index;
- const SlicingIndex& get_index() const {
- return index ;
- }
+ FullDataFrame(const DataFrame& data_) : index(data_.nrows()) {}
- inline int nrows() const {
- return index.size() ;
- }
+ const SlicingIndex& get_index() const {
+ return index;
+ }
- private:
- SlicingIndex index ;
- } ;
+ inline int nrows() const {
+ return index.size();
+ }
+
+private:
+ NaturalSlicingIndex index;
+};
}
#endif
diff --git a/inst/include/dplyr/Gatherer.h b/inst/include/dplyr/Gatherer.h
index aaf1459..6ae948a 100644
--- a/inst/include/dplyr/Gatherer.h
+++ b/inst/include/dplyr/Gatherer.h
@@ -1,323 +1,280 @@
#ifndef dplyr_Gatherer_H
#define dplyr_Gatherer_H
+#include <tools/all_na.h>
+#include <tools/hash.h>
+#include <tools/utils.h>
+
+#include <dplyr/checks.h>
+
+#include <dplyr/Result/GroupedCallProxy.h>
+
+#include <dplyr/vector_class.h>
+#include <dplyr/checks.h>
+#include <dplyr/Collecter.h>
+#include <dplyr/bad.h>
+
namespace dplyr {
- class Gatherer {
- public:
- virtual ~Gatherer(){}
- virtual SEXP collect() = 0 ;
- } ;
-
- template <int RTYPE, typename Data, typename Subsets>
- class GathererImpl : public Gatherer {
- public:
- typedef typename traits::storage_type<RTYPE>::type STORAGE ;
- typedef GroupedCallProxy<Data,Subsets> Proxy ;
-
- GathererImpl( RObject& first, SlicingIndex& indices, Proxy& proxy_, const Data& gdf_, int first_non_na_ ) :
- gdf(gdf_), proxy(proxy_), data(gdf.nrows(), Vector<RTYPE>::get_na() ), first_non_na(first_non_na_)
- {
- if( first_non_na < gdf.ngroups() )
- grab( first, indices ) ;
- copy_most_attributes( data, first ) ;
- }
-
- SEXP collect(){
- int ngroups = gdf.ngroups() ;
- if( first_non_na == ngroups ) return data ;
- typename Data::group_iterator git = gdf.group_begin() ;
- int i = 0 ;
- for(; i<first_non_na; i++) ++git ;
- ++git; i++ ;
- for(; i<ngroups; i++, ++git){
- SlicingIndex indices = *git ;
- Shield<SEXP> subset( proxy.get( indices ) ) ;
- grab(subset, indices);
- }
- return data ;
- }
-
- private:
-
- inline void grab(SEXP subset, const SlicingIndex& indices){
- int n = Rf_length(subset) ;
- if( is<LogicalVector>(subset) && all(is_na(LogicalVector(subset))).is_true() ){
- grab_rep( Vector<RTYPE>::get_na(), indices ) ;
- } else {
- check_type(subset) ;
- if(n == indices.size() ){
- grab_along( subset, indices ) ;
- } else if( n == 1) {
- grab_rep( Rcpp::internal::r_vector_start<RTYPE>(subset)[0], indices ) ;
- } else {
- stop ( "incompatible size (%d), expecting %d (the group size) or 1",
- n, indices.size()) ;
- }
- }
- }
-
- void grab_along( SEXP subset, const SlicingIndex& indices ){
- int n = indices.size();
- STORAGE* ptr = Rcpp::internal::r_vector_start<RTYPE>( subset ) ;
- for( int j=0; j<n; j++){
- data[ indices[j] ] = ptr[j] ;
- }
- }
-
- void check_type(SEXP subset){
- if( TYPEOF(subset) != RTYPE ){
- stop( "incompatible types, expecting a %s vector", vector_class<RTYPE>() ) ;
- }
- }
-
- void grab_rep( STORAGE value, const SlicingIndex& indices ){
- int n = indices.size();
- for( int j=0; j<n; j++){
- data[ indices[j] ] = value ;
- }
- }
-
- const Data& gdf ;
- Proxy& proxy ;
- Vector<RTYPE> data ;
- int first_non_na ;
-
- } ;
-
- template <typename Data, typename Subsets>
- class ListGatherer : public Gatherer {
- public:
- typedef GroupedCallProxy<Data,Subsets> Proxy ;
-
- ListGatherer( List first, SlicingIndex& indices, Proxy& proxy_, const Data& gdf_, int first_non_na_ ) :
- gdf(gdf_), proxy(proxy_), data(gdf.nrows()), first_non_na(first_non_na_)
- {
- if( first_non_na < gdf.ngroups() ){
- perhaps_duplicate(first) ;
- grab( first, indices ) ;
- }
-
- copy_most_attributes( data, first ) ;
- }
-
- SEXP collect(){
- int ngroups = gdf.ngroups() ;
- if( first_non_na == ngroups ) return data ;
- typename Data::group_iterator git = gdf.group_begin() ;
- int i = 0 ;
- for(; i<first_non_na; i++) ++git ;
- ++git; i++ ;
- for(; i<ngroups; i++, ++git){
- SlicingIndex indices = *git ;
- List subset( proxy.get(indices) ) ;
- perhaps_duplicate(subset) ;
- grab(subset, indices);
- }
- return data ;
- }
-
- private:
-
- inline void perhaps_duplicate( List& x ){
- int n = x.size() ;
- for( int i=0; i<n; i++){
- SEXP xi = x[i] ;
- if( IS_DPLYR_SHRINKABLE_VECTOR(xi) ) {
- x[i] = Rf_duplicate(xi) ;
- } else if( TYPEOF(xi) == VECSXP ){
- List lxi(xi) ;
- perhaps_duplicate( lxi ) ;
- }
- }
- }
-
- inline void grab(const List& subset, const SlicingIndex& indices){
- int n = subset.size() ;
-
- if(n == indices.size() ){
- grab_along( subset, indices ) ;
- } else if( n == 1) {
- grab_rep( subset[0], indices ) ;
- } else {
- stop ( "incompatible size (%d), expecting %d (the group size) or 1",
- n, indices.size()) ;
- }
- }
-
- void grab_along( const List& subset, const SlicingIndex& indices ){
- int n = indices.size();
- for( int j=0; j<n; j++){
- data[ indices[j] ] = subset[j] ;
- }
- }
-
- void grab_rep( SEXP value, const SlicingIndex& indices ){
- int n = indices.size();
- for( int j=0; j<n; j++){
- data[ indices[j] ] = value ;
- }
- }
-
- const Data& gdf ;
- Proxy& proxy ;
- List data ;
- int first_non_na ;
-
- } ;
-
- template <typename Data, typename Subsets>
- class FactorGatherer : public Gatherer {
- public:
- typedef GroupedCallProxy<Data,Subsets> Proxy ;
- typedef IntegerVector Factor;
-
- FactorGatherer( RObject& first, SlicingIndex& indices, Proxy& proxy_, const Data& gdf_, int first_non_na_ ) :
- levels(), data(gdf_.nrows(), NA_INTEGER), first_non_na(first_non_na_), proxy(proxy_), gdf(gdf_)
- {
- if( first_non_na < gdf.ngroups() )
- grab( (SEXP)first, indices ) ;
- copy_most_attributes( data, first ) ;
- }
-
- inline SEXP collect(){
- int ngroups = gdf.ngroups() ;
- typename Data::group_iterator git = gdf.group_begin() ;
- int i = 0 ;
- for(; i<first_non_na; i++) ++git ;
- for(; i<ngroups; i++, ++git){
- SlicingIndex indices = *git ;
- Factor subset( proxy.get( indices ) ) ;
- grab(subset, indices);
- }
- CharacterVector levels_(levels_vector.begin(), levels_vector.end() ) ;
- data.attr("levels") = levels_ ;
- return data ;
- }
-
- private:
- dplyr_hash_map<SEXP, int> levels ;
- Factor data ;
- int first_non_na ;
- Proxy& proxy ;
- const Data& gdf ;
- std::vector<SEXP> levels_vector ;
-
- void grab( Factor f, const SlicingIndex& indices ){
- // update levels if needed
- CharacterVector lev = f.attr("levels") ;
- std::vector<int> matches( lev.size() ) ;
- int nlevels = levels.size() ;
- for( int i=0; i<lev.size(); i++){
- SEXP level = lev[i] ;
- if( !levels.count(level) ){
- nlevels++ ;
- levels_vector.push_back(level) ;
- levels[level] = nlevels ;
- matches[i] = nlevels ;
- } else {
- matches[i] = levels[level] ;
- }
- }
-
- // grab data
- int n = indices.size() ;
-
- int nf = f.size() ;
- if( n == nf ){
- for( int i=0; i<n; i++){
- if( f[i] != NA_INTEGER ){
- data[ indices[i] ] = matches[ f[i] - 1 ] ;
- }
- }
- } else if( nf == 1){
- int value = NA_INTEGER ;
- if( f[0] != NA_INTEGER ){
- value = matches[ f[0] - 1] ;
- for( int i=0; i<n; i++){
- data[ indices[i] ] = value ;
- }
- }
- } else {
- stop( "incompatible size" ) ;
- }
- }
-
-
- } ;
-
- template <int RTYPE>
- class ConstantGathererImpl : public Gatherer {
- public:
- ConstantGathererImpl( Vector<RTYPE> constant, int n ) :
- value( n, Rcpp::internal::r_vector_start<RTYPE>(constant)[0] )
- {
- copy_most_attributes( value, constant ) ;
- }
-
- inline SEXP collect() {
- return value ;
- }
-
- private:
- Vector<RTYPE> value ;
- } ;
-
- inline Gatherer* constant_gatherer(SEXP x, int n){
- if( Rf_inherits(x, "POSIXlt" ) ){
- stop("`mutate` does not support `POSIXlt` results");
- }
- switch( TYPEOF(x) ){
- case INTSXP: return new ConstantGathererImpl<INTSXP>( x, n ) ;
- case REALSXP: return new ConstantGathererImpl<REALSXP>( x, n ) ;
- case LGLSXP: return new ConstantGathererImpl<LGLSXP>( x, n ) ;
- case STRSXP: return new ConstantGathererImpl<STRSXP>( x, n ) ;
- case CPLXSXP: return new ConstantGathererImpl<CPLXSXP>( x, n ) ;
- case VECSXP: return new ConstantGathererImpl<STRSXP>( x, n ) ;
- default: break ;
- }
- stop("Unsupported vector type %s", Rf_type2char(TYPEOF(x))) ;
- return 0 ;
+class Gatherer {
+public:
+ virtual ~Gatherer() {}
+ virtual SEXP collect() = 0;
+};
+
+template <typename Data, typename Subsets>
+class GathererImpl : public Gatherer {
+public:
+ typedef GroupedCallProxy<Data, Subsets> Proxy;
+
+ GathererImpl(RObject& first, SlicingIndex& indices, Proxy& proxy_, const Data& gdf_, int first_non_na_, const SymbolString& name_) :
+ gdf(gdf_), proxy(proxy_), first_non_na(first_non_na_), name(name_)
+ {
+ coll = collecter(first, gdf.nrows());
+ if (first_non_na < gdf.ngroups())
+ grab(first, indices);
+ }
+
+ ~GathererImpl() {
+ if (coll != 0) {
+ delete coll;
+ }
+ }
+
+ SEXP collect() {
+ int ngroups = gdf.ngroups();
+ if (first_non_na == ngroups) return coll->get();
+ typename Data::group_iterator git = gdf.group_begin();
+ int i = 0;
+ for (; i < first_non_na; i++) ++git;
+ ++git;
+ i++;
+ for (; i < ngroups; i++, ++git) {
+ const SlicingIndex& indices = *git;
+ Shield<SEXP> subset(proxy.get(indices));
+ grab(subset, indices);
+ }
+ return coll->get();
+ }
+
+private:
+
+ inline void grab(SEXP subset, const SlicingIndex& indices) {
+ int n = Rf_length(subset);
+ if (n == indices.size()) {
+ grab_along(subset, indices);
+ } else if (n == 1) {
+ grab_rep(subset, indices);
+ } else if (Rf_isNull(subset)) {
+ stop("incompatible types (NULL), expecting %s", coll->describe());
+ } else {
+ check_length(n, indices.size(), "the group size", name);
}
- template <typename Data, typename Subsets>
- inline Gatherer* gatherer( GroupedCallProxy<Data,Subsets>& proxy, const Data& gdf, SEXP name ){
- typename Data::group_iterator git = gdf.group_begin() ;
- SlicingIndex indices = *git ;
- RObject first( proxy.get(indices) ) ;
-
- if( Rf_inherits(first, "POSIXlt" ) ){
- stop("`mutate` does not support `POSIXlt` results");
- }
- int ng = gdf.ngroups() ;
- int i = 0 ;
- while( all_na(first) ){
- i++ ; if( i == ng ) break ;
- ++git ;
- indices = *git ;
- first = proxy.get(indices) ;
- }
-
- switch( TYPEOF(first) ){
- case INTSXP:
- {
- if( Rf_inherits(first, "factor"))
- return new FactorGatherer<Data, Subsets>( first, indices, proxy, gdf, i) ;
- return new GathererImpl<INTSXP,Data,Subsets> ( first, indices, proxy, gdf, i ) ;
- }
- case REALSXP: return new GathererImpl<REALSXP,Data,Subsets> ( first, indices, proxy, gdf, i ) ;
- case LGLSXP: return new GathererImpl<LGLSXP,Data,Subsets> ( first, indices, proxy, gdf, i ) ;
- case STRSXP: return new GathererImpl<STRSXP,Data,Subsets> ( first, indices, proxy, gdf, i ) ;
- case VECSXP: return new ListGatherer<Data,Subsets> ( List(first), indices, proxy, gdf, i ) ;
- case CPLXSXP: return new GathererImpl<CPLXSXP,Data,Subsets> ( first, indices, proxy, gdf, i ) ;
- default: break ;
- }
-
- check_supported_type(first, name);
- return 0;
+ }
+
+ void grab_along(SEXP subset, const SlicingIndex& indices) {
+ if (coll->compatible(subset)) {
+ // if the current source is compatible, collect
+ coll->collect(indices, subset);
+ } else if (coll->can_promote(subset)) {
+ // setup a new Collecter
+ Collecter* new_collecter = promote_collecter(subset, gdf.nrows(), coll);
+
+ // import data from previous collecter.
+ new_collecter->collect(NaturalSlicingIndex(gdf.nrows()), coll->get());
+
+ // import data from this chunk
+ new_collecter->collect(indices, subset);
+
+ // dispose the previous collecter and keep the new one.
+ delete coll;
+ coll = new_collecter;
+ } else if (coll->is_logical_all_na()) {
+ Collecter* new_collecter = collecter(subset, gdf.nrows());
+ new_collecter->collect(indices, subset);
+ delete coll;
+ coll = new_collecter;
+ } else {
+ bad_col(name, "can't be converted from {source_type} to {target_type}",
+ _["source_type"] = coll->describe(), _["target_type"] = get_single_class(subset));
+ }
+ }
+
+ void grab_rep(SEXP value, const SlicingIndex& indices) {
+ int n = indices.size();
+ // FIXME: This can be made faster if `source` in `Collecter->collect(source, indices)`
+ // could be of length 1 recycling the value.
+ // TODO: create Collecter->collect_one(source, indices)?
+ for (int j = 0; j < n; j++) {
+ grab_along(value, RowwiseSlicingIndex(indices[j]));
+ }
+ }
+
+ const Data& gdf;
+ Proxy& proxy;
+ Collecter* coll;
+ int first_non_na;
+ const SymbolString& name;
+
+};
+
+template <typename Data, typename Subsets>
+class ListGatherer : public Gatherer {
+public:
+ typedef GroupedCallProxy<Data, Subsets> Proxy;
+
+ ListGatherer(List first, SlicingIndex& indices, Proxy& proxy_, const Data& gdf_, int first_non_na_, const SymbolString& name_) :
+ gdf(gdf_), proxy(proxy_), data(gdf.nrows()), first_non_na(first_non_na_), name(name_)
+ {
+ if (first_non_na < gdf.ngroups()) {
+ perhaps_duplicate(first);
+ grab(first, indices);
}
+ copy_most_attributes(data, first);
+ }
+
+ SEXP collect() {
+ int ngroups = gdf.ngroups();
+ if (first_non_na == ngroups) return data;
+ typename Data::group_iterator git = gdf.group_begin();
+ int i = 0;
+ for (; i < first_non_na; i++) ++git;
+ ++git;
+ i++;
+ for (; i < ngroups; i++, ++git) {
+ const SlicingIndex& indices = *git;
+ List subset(proxy.get(indices));
+ perhaps_duplicate(subset);
+ grab(subset, indices);
+ }
+ return data;
+ }
+
+private:
+
+ inline void perhaps_duplicate(List& x) {
+ int n = x.size();
+ for (int i = 0; i < n; i++) {
+ SEXP xi = x[i];
+ if (IS_DPLYR_SHRINKABLE_VECTOR(xi)) {
+ x[i] = Rf_duplicate(xi);
+ } else if (TYPEOF(xi) == VECSXP) {
+ List lxi(xi);
+ perhaps_duplicate(lxi);
+ }
+ }
+ }
+
+ inline void grab(const List& subset, const SlicingIndex& indices) {
+ int n = subset.size();
+
+ if (n == indices.size()) {
+ grab_along(subset, indices);
+ } else if (n == 1) {
+ grab_rep(subset[0], indices);
+ } else {
+ check_length(n, indices.size(), "the group size", name);
+ }
+ }
+
+ void grab_along(const List& subset, const SlicingIndex& indices) {
+ int n = indices.size();
+ for (int j = 0; j < n; j++) {
+ data[ indices[j] ] = subset[j];
+ }
+ }
+
+ void grab_rep(SEXP value, const SlicingIndex& indices) {
+ int n = indices.size();
+ for (int j = 0; j < n; j++) {
+ data[ indices[j] ] = value;
+ }
+ }
+
+ const Data& gdf;
+ Proxy& proxy;
+ List data;
+ int first_non_na;
+ const SymbolString name;
+
+};
+
+
+template <int RTYPE>
+class ConstantGathererImpl : public Gatherer {
+public:
+ ConstantGathererImpl(Vector<RTYPE> constant, int n) :
+ value(n, Rcpp::internal::r_vector_start<RTYPE>(constant)[0])
+ {
+ copy_most_attributes(value, constant);
+ }
+
+ inline SEXP collect() {
+ return value;
+ }
+
+private:
+ Vector<RTYPE> value;
+};
+
+inline Gatherer* constant_gatherer(SEXP x, int n, const SymbolString& name) {
+ if (Rf_inherits(x, "POSIXlt")) {
+ bad_col(name, "is of unsupported class POSIXlt");
+ }
+ switch (TYPEOF(x)) {
+ case INTSXP:
+ return new ConstantGathererImpl<INTSXP>(x, n);
+ case REALSXP:
+ return new ConstantGathererImpl<REALSXP>(x, n);
+ case LGLSXP:
+ return new ConstantGathererImpl<LGLSXP>(x, n);
+ case STRSXP:
+ return new ConstantGathererImpl<STRSXP>(x, n);
+ case CPLXSXP:
+ return new ConstantGathererImpl<CPLXSXP>(x, n);
+ case VECSXP:
+ return new ConstantGathererImpl<VECSXP>(x, n);
+ default:
+ break;
+ }
+ bad_col(name, "is of unsupported type {type}", _["type"] = Rf_type2char(TYPEOF(x)));
+}
+
+template <typename Data, typename Subsets>
+inline Gatherer* gatherer(GroupedCallProxy<Data, Subsets>& proxy, const Data& gdf, const SymbolString& name) {
+ typename Data::group_iterator git = gdf.group_begin();
+ typename Data::slicing_index indices = *git;
+ RObject first(proxy.get(indices));
+
+ if (Rf_inherits(first, "POSIXlt")) {
+ bad_col(name, "is of unsupported class POSIXlt");
+ }
+
+ check_supported_type(first, name);
+ check_length(Rf_length(first), indices.size(), "the group size", name);
+
+ const int ng = gdf.ngroups();
+ int i = 0;
+ while (all_na(first)) {
+ i++;
+ if (i == ng) break;
+ ++git;
+ indices = *git;
+ first = proxy.get(indices);
+ }
+
+
+ if (TYPEOF(first) == VECSXP) {
+ return new ListGatherer<Data, Subsets> (List(first), indices, proxy, gdf, i, name);
+ } else {
+ return new GathererImpl<Data, Subsets> (first, indices, proxy, gdf, i, name);
+ }
+}
+
} // namespace dplyr
#endif
+
diff --git a/inst/include/dplyr/GroupedDataFrame.h b/inst/include/dplyr/GroupedDataFrame.h
index d42f22f..8c13a7b 100644
--- a/inst/include/dplyr/GroupedDataFrame.h
+++ b/inst/include/dplyr/GroupedDataFrame.h
@@ -1,150 +1,163 @@
#ifndef dplyr_tools_GroupedDataFrame_H
#define dplyr_tools_GroupedDataFrame_H
-namespace Rcpp {
-
- inline void check_valid_colnames( const DataFrame& df){
- if( df.size() ){
- CharacterVector names(df.names()) ;
- LogicalVector dup = duplicated(names) ;
- if( any(dup).is_true() ){
- std::stringstream s ;
- s << "found duplicated column name: " ;
- bool first = true ;
- for( int i=0; i<df.size(); i++){
- if( dup[i] == TRUE ){
- if( first ){
- first = false ;
- } else {
- s << ", " ;
- }
- s << names[i] ;
- }
- }
- stop(s.str()) ;
- }
+#include <dplyr/registration.h>
+#include <tools/SlicingIndex.h>
+
+#include <dplyr/Result/GroupedSubset.h>
+
+#include <tools/SymbolVector.h>
+#include <tools/SymbolMap.h>
+
+#include <dplyr/bad.h>
+
+namespace dplyr {
+
+inline void check_valid_colnames(const DataFrame& df) {
+ if (df.size()) {
+ CharacterVector names(df.names());
+ LogicalVector dup = duplicated(names);
+ if (any(dup).is_true()) {
+ std::stringstream s;
+ s << "found duplicated column name: ";
+ bool first = true;
+ for (int i = 0; i < df.size(); i++) {
+ if (dup[i] == TRUE) {
+ if (first) {
+ first = false;
+ } else {
+ s << ", ";
+ }
+ s << names[i];
}
+ }
+ stop(s.str());
}
+ }
+}
- class GroupedDataFrame ;
-
- class GroupedDataFrameIndexIterator {
- public:
- GroupedDataFrameIndexIterator( const GroupedDataFrame& gdf_ ) ;
-
- GroupedDataFrameIndexIterator& operator++() ;
-
- SlicingIndex operator*() const ;
-
- int i ;
- const GroupedDataFrame& gdf ;
- List indices ;
- } ;
-
- class GroupedDataFrame {
- public:
- typedef GroupedDataFrameIndexIterator group_iterator ;
- GroupedDataFrame( SEXP x):
- data_(x),
- group_sizes(),
- biggest_group_size(0),
- symbols( data_.attr("vars") ),
- labels()
- {
- // handle lazyness
- bool is_lazy = Rf_isNull( data_.attr( "group_sizes") ) || Rf_isNull( data_.attr( "labels") ) ;
-
- if( is_lazy ){
- data_ = build_index_cpp( data_) ;
- }
- group_sizes = data_.attr( "group_sizes" );
- biggest_group_size = data_.attr( "biggest_group_size" ) ;
- labels = data_.attr( "labels" );
-
- if( !is_lazy ){
- // check consistency of the groups
- int rows_in_groups = sum(group_sizes) ;
- if( data_.nrows() != rows_in_groups ){
- stop( "corrupt 'grouped_df', contains %d rows, and %s rows in groups", data_.nrows(), rows_in_groups );
- }
- }
- }
+class GroupedDataFrame;
- group_iterator group_begin() const {
- return GroupedDataFrameIndexIterator( *this ) ;
- }
+class GroupedDataFrameIndexIterator {
+public:
+ GroupedDataFrameIndexIterator(const GroupedDataFrame& gdf_);
- SEXP symbol( int i) const {
- return symbols[i] ;
- }
+ GroupedDataFrameIndexIterator& operator++();
- DataFrame& data() {
- return data_ ;
- }
- const DataFrame& data() const {
- return data_ ;
- }
+ GroupedSlicingIndex operator*() const;
- inline int ngroups() const {
- return group_sizes.size() ;
- }
+ int i;
+ const GroupedDataFrame& gdf;
+ List indices;
+};
- inline int nvars() const {
- return labels.size() ;
- }
+class GroupedDataFrame {
+public:
+ typedef GroupedDataFrameIndexIterator group_iterator;
+ typedef GroupedSlicingIndex slicing_index;
+ typedef GroupedSubset subset;
- inline int nrows() const {
- return data_.nrows() ;
- }
+ GroupedDataFrame(SEXP x):
+ data_(x),
+ group_sizes(),
+ biggest_group_size(0),
+ symbols(get_vars(data_)),
+ labels()
+ {
+ // handle lazyness
+ bool is_lazy = Rf_isNull(data_.attr("group_sizes")) || Rf_isNull(data_.attr("labels"));
- inline SEXP label(int i) const {
- return labels[i];
- }
+ if (is_lazy) {
+ data_ = build_index_cpp(data_);
+ }
+ group_sizes = data_.attr("group_sizes");
+ biggest_group_size = data_.attr("biggest_group_size");
+ labels = data_.attr("labels");
+
+ if (!is_lazy) {
+ // check consistency of the groups
+ int rows_in_groups = sum(group_sizes);
+ if (data_.nrows() != rows_in_groups) {
+ bad_arg(".data", "is a corrupt grouped_df, contains {rows} rows, and {group_rows} rows in groups",
+ _["rows"] = data_.nrows(), _["group_rows"] = rows_in_groups);
+ }
+ }
+ }
- inline int max_group_size() const{
- return biggest_group_size ;
- }
+ group_iterator group_begin() const {
+ return GroupedDataFrameIndexIterator(*this);
+ }
- inline bool has_group(SEXP g) const {
- SEXP symb = Rf_installChar(g) ;
- int n = symbols.size() ;
- for( int i=0; i<n; i++){
- if( symbols[i] == symb ) return true ;
- }
- return false ;
- }
+ SymbolString symbol(int i) const {
+ return symbols.get_name(i);
+ }
- inline const IntegerVector& get_group_sizes() const {
- return group_sizes ;
- }
+ DataFrame& data() {
+ return data_;
+ }
+ const DataFrame& data() const {
+ return data_;
+ }
- private:
+ inline int ngroups() const {
+ return group_sizes.size();
+ }
- DataFrame data_ ;
- IntegerVector group_sizes ;
- int biggest_group_size ;
- ListOf<Symbol> symbols ;
- DataFrame labels ;
+ inline int nvars() const {
+ return labels.size();
+ }
- } ;
+ inline int nrows() const {
+ return data_.nrows();
+ }
- template <>
- inline bool is<GroupedDataFrame>( SEXP x){
- return Rf_inherits(x, "grouped_df" ) && Rf_getAttrib(x, Rf_install("vars") ) != R_NilValue ;
- }
+ inline SEXP label(int i) const {
+ return labels[i];
+ }
- inline GroupedDataFrameIndexIterator::GroupedDataFrameIndexIterator( const GroupedDataFrame& gdf_ ) :
- i(0), gdf(gdf_), indices(gdf.data().attr("indices")) {}
+ inline int max_group_size() const {
+ return biggest_group_size;
+ }
- inline GroupedDataFrameIndexIterator& GroupedDataFrameIndexIterator::operator++(){
- i++;
- return *this ;
- }
+ inline bool has_group(const SymbolString& g) const {
+ return symbols.has(g);
+ }
- inline SlicingIndex GroupedDataFrameIndexIterator::operator*() const {
- return SlicingIndex( IntegerVector(indices[i]), i ) ;
- }
+ inline subset* create_subset(SEXP x) const {
+ return grouped_subset(x, max_group_size());
+ }
+
+private:
+
+ DataFrame data_;
+ IntegerVector group_sizes;
+ int biggest_group_size;
+ SymbolMap symbols;
+ DataFrame labels;
+
+};
+
+inline GroupedDataFrameIndexIterator::GroupedDataFrameIndexIterator(const GroupedDataFrame& gdf_) :
+ i(0), gdf(gdf_), indices(gdf.data().attr("indices")) {}
+
+inline GroupedDataFrameIndexIterator& GroupedDataFrameIndexIterator::operator++() {
+ i++;
+ return *this;
+}
+
+inline GroupedSlicingIndex GroupedDataFrameIndexIterator::operator*() const {
+ return GroupedSlicingIndex(IntegerVector(indices[i]), i);
+}
+
+}
+
+namespace Rcpp {
+using namespace dplyr;
+template <>
+inline bool is<GroupedDataFrame>(SEXP x) {
+ return Rf_inherits(x, "grouped_df") && Rf_getAttrib(x, Rf_install("vars")) != R_NilValue;
+}
}
diff --git a/inst/include/dplyr/Groups.h b/inst/include/dplyr/Groups.h
new file mode 100644
index 0000000..64c78a1
--- /dev/null
+++ b/inst/include/dplyr/Groups.h
@@ -0,0 +1,16 @@
+#ifndef dplyr_dplyr_Groups_H
+#define dplyr_dplyr_Groups_H
+
+#include <tools/Quosure.h>
+
+#include <dplyr/GroupedDataFrame.h>
+#include <dplyr/RowwiseDataFrame.h>
+
+
+void check_not_groups(const QuosureList& quosures, const GroupedDataFrame& gdf);
+void check_not_groups(const QuosureList& quosures, const RowwiseDataFrame& gdf);
+
+SEXP strip_group_attributes(SEXP df);
+
+
+#endif // #ifndef dplyr_dplyr_Groups_H
diff --git a/inst/include/dplyr/Hybrid.h b/inst/include/dplyr/Hybrid.h
new file mode 100644
index 0000000..a8e72ba
--- /dev/null
+++ b/inst/include/dplyr/Hybrid.h
@@ -0,0 +1,12 @@
+#ifndef dplyr_dplyr_Hybrid_H
+#define dplyr_dplyr_Hybrid_H
+
+namespace dplyr {
+class ILazySubsets;
+class Result;
+
+Result* get_handler(SEXP, const ILazySubsets&, const Environment&);
+
+}
+
+#endif // dplyr_dplyr_Hybrid_H
diff --git a/inst/include/dplyr/HybridHandler.h b/inst/include/dplyr/HybridHandler.h
new file mode 100644
index 0000000..7303755
--- /dev/null
+++ b/inst/include/dplyr/HybridHandler.h
@@ -0,0 +1,11 @@
+#ifndef dplyr_dplyr_HybridHandler_H
+#define dplyr_dplyr_HybridHandler_H
+
+namespace dplyr {
+class ILazySubsets;
+class Result;
+}
+
+typedef dplyr::Result* (*HybridHandler)(SEXP, const dplyr::ILazySubsets&, int);
+
+#endif // dplyr_dplyr_HybridHandlerMap_H
diff --git a/inst/include/dplyr/HybridHandlerMap.h b/inst/include/dplyr/HybridHandlerMap.h
new file mode 100644
index 0000000..d6862d0
--- /dev/null
+++ b/inst/include/dplyr/HybridHandlerMap.h
@@ -0,0 +1,20 @@
+#ifndef dplyr_dplyr_HybridHandlerMap_H
+#define dplyr_dplyr_HybridHandlerMap_H
+
+#include <tools/hash.h>
+#include <dplyr/HybridHandler.h>
+
+typedef dplyr_hash_map<SEXP, HybridHandler> HybridHandlerMap;
+
+void install_simple_handlers(HybridHandlerMap& handlers);
+void install_minmax_handlers(HybridHandlerMap& handlers);
+void install_count_handlers(HybridHandlerMap& handlers);
+void install_nth_handlers(HybridHandlerMap& handlers);
+void install_window_handlers(HybridHandlerMap& handlers);
+void install_offset_handlers(HybridHandlerMap& handlers);
+void install_in_handlers(HybridHandlerMap& handlers);
+void install_debug_handlers(HybridHandlerMap& handlers);
+
+bool hybridable(RObject arg);
+
+#endif // dplyr_dplyr_HybridHandlerMap_H
diff --git a/inst/include/dplyr/JoinVisitor.h b/inst/include/dplyr/JoinVisitor.h
index 5312ab1..dd98025 100644
--- a/inst/include/dplyr/JoinVisitor.h
+++ b/inst/include/dplyr/JoinVisitor.h
@@ -1,19 +1,26 @@
#ifndef dplyr_JoinVisitor_H
#define dplyr_JoinVisitor_H
-namespace dplyr{
+#include <dplyr/Column.h>
+#include <dplyr/visitor_set/VisitorSetIndexSet.h>
- class JoinVisitor{
- public:
- virtual ~JoinVisitor(){}
+namespace dplyr {
- virtual size_t hash(int i) = 0 ;
- virtual bool equal(int i, int j) = 0 ;
+class DataFrameJoinVisitors;
- virtual SEXP subset( const std::vector<int>& indices ) = 0;
- virtual SEXP subset( const VisitorSetIndexSet<DataFrameJoinVisitors>& set ) = 0;
+class JoinVisitor {
+public:
+ virtual ~JoinVisitor() {}
- } ;
+ virtual size_t hash(int i) = 0;
+ virtual bool equal(int i, int j) = 0;
+
+ virtual SEXP subset(const std::vector<int>& indices) = 0;
+ virtual SEXP subset(const VisitorSetIndexSet<DataFrameJoinVisitors>& set) = 0;
+
+};
+
+JoinVisitor* join_visitor(const Column& left, const Column& right, bool warn, bool accept_na_match = true);
}
diff --git a/inst/include/dplyr/JoinVisitorImpl.h b/inst/include/dplyr/JoinVisitorImpl.h
index a4d50fc..34c62b2 100644
--- a/inst/include/dplyr/JoinVisitorImpl.h
+++ b/inst/include/dplyr/JoinVisitorImpl.h
@@ -1,487 +1,261 @@
#ifndef dplyr_JoinVisitorImpl_H
#define dplyr_JoinVisitorImpl_H
-namespace dplyr{
-
- CharacterVector get_uniques( const CharacterVector& left, const CharacterVector& right) ;
-
- template <int LHS_RTYPE, int RHS_RTYPE>
- class JoinVisitorImpl : public JoinVisitor, public comparisons_different<LHS_RTYPE, RHS_RTYPE>{
- public:
- typedef Vector<LHS_RTYPE> LHS_Vec ;
- typedef Vector<RHS_RTYPE> RHS_Vec ;
-
- typedef typename Rcpp::traits::storage_type<LHS_RTYPE>::type LHS_STORAGE ;
- typedef typename Rcpp::traits::storage_type<RHS_RTYPE>::type RHS_STORAGE ;
-
- typedef boost::hash<LHS_STORAGE> LHS_hasher ;
- typedef boost::hash<RHS_STORAGE> RHS_hasher ;
-
- JoinVisitorImpl( LHS_Vec left_, RHS_Vec right_ ) : left(left_), right(right_){
- check_attribute_compatibility(left, right) ;
- }
-
- size_t hash(int i) ;
-
- inline bool equal( int i, int j) {
- if( i>=0 && j>=0 ) {
- return comparisons<LHS_RTYPE>().equal_or_both_na( left[i], left[j] ) ;
- } else if( i < 0 && j < 0 ) {
- return comparisons<LHS_RTYPE>().equal_or_both_na( right[-i-1], right[-j-1] ) ;
- } else if( i >= 0 && j < 0) {
- return comparisons_different<LHS_RTYPE,RHS_RTYPE>().equal_or_both_na( left[i], right[-j-1] ) ;
- } else {
- return comparisons_different<RHS_RTYPE,LHS_RTYPE>().equal_or_both_na( right[-i-1], left[j] ) ;
- }
- }
-
- inline SEXP subset( const std::vector<int>& indices );
- inline SEXP subset( const VisitorSetIndexSet<DataFrameJoinVisitors>& set ) ;
-
- LHS_Vec left ;
- RHS_Vec right ;
- LHS_hasher LHS_hash_fun ;
- RHS_hasher RHS_hash_fun ;
-
- } ;
-
- template <typename Visitor>
- class Subsetter {
- public:
- typedef typename Visitor::Vec Vec ;
-
- Subsetter( const Visitor& v_) : v(v_){} ;
-
- inline SEXP subset( const std::vector<int>& indices ) {
- int n = indices.size() ;
- Vec res = no_init(n) ;
- for( int i=0; i<n; i++) {
- res[i] = v.get(indices[i]) ;
- }
- return res ;
- }
-
- inline SEXP subset( const VisitorSetIndexSet<DataFrameJoinVisitors>& set ) {
- int n = set.size() ;
- Vec res = no_init(n) ;
- VisitorSetIndexSet<DataFrameJoinVisitors>::const_iterator it=set.begin() ;
- for( int i=0; i<n; i++, ++it) {
- res[i] = v.get(*it) ;
- }
- return res ;
- }
- private:
- const Visitor& v ;
- } ;
-
- template <int RTYPE>
- class JoinVisitorImpl<RTYPE,RTYPE> : public JoinVisitor, public comparisons<RTYPE>{
- public:
- typedef comparisons<RTYPE> Compare ;
-
- typedef Vector<RTYPE> Vec ;
- typedef typename Rcpp::traits::storage_type<RTYPE>::type STORAGE ;
- typedef boost::hash<STORAGE> hasher ;
-
- JoinVisitorImpl( Vec left_, Vec right_ ) : left(left_), right(right_){}
-
- inline size_t hash(int i){
- return hash_fun( get(i) ) ;
- }
-
- inline bool equal( int i, int j) {
- return Compare::equal_or_both_na(
- get(i), get(j)
- ) ;
- }
-
- inline SEXP subset( const std::vector<int>& indices ) {
- RObject res = Subsetter<JoinVisitorImpl>(*this).subset(indices ) ;
- copy_most_attributes(res, left) ;
- return res ;
- }
-
- inline SEXP subset( const VisitorSetIndexSet<DataFrameJoinVisitors>& set ) {
- RObject res = Subsetter<JoinVisitorImpl>(*this).subset(set) ;
- copy_most_attributes(res, left) ;
- return res ;
- }
-
- inline STORAGE get(int i) const {
- return i >= 0 ? left[i] : right[-i-1] ;
- }
-
- protected:
- Vec left, right ;
- hasher hash_fun ;
-
- } ;
-
- class JoinFactorFactorVisitor : public JoinVisitor {
- public:
- typedef CharacterVector Vec ;
-
- JoinFactorFactorVisitor( const IntegerVector& left_, const IntegerVector& right_ ) :
- left(left_),
- right(right_),
- left_levels (left.attr("levels")),
- right_levels(right.attr("levels")),
- uniques( get_uniques(left_levels, right_levels) ),
- left_match ( match( left_levels, uniques) ),
- right_match( match( right_levels, uniques) )
- {}
-
- inline size_t hash(int i){
- return hash_fun( get(i) ) ;
- }
-
- inline bool equal( int i, int j){
- return get(i) == get(j) ;
- }
-
- inline SEXP subset( const std::vector<int>& indices ) {
- return Subsetter<JoinFactorFactorVisitor>(*this).subset(indices) ;
- }
-
- inline SEXP subset( const VisitorSetIndexSet<DataFrameJoinVisitors>& set ) {
- return Subsetter<JoinFactorFactorVisitor>(*this).subset(set) ;
- }
-
- inline SEXP get(int i) const {
- if( i >= 0){
- int pos = left[i] ;
- return (pos == NA_INTEGER) ? NA_STRING : SEXP( uniques[left_match[pos-1] - 1] ) ;
- } else {
- int pos = right[-i-1] ;
- return (pos == NA_INTEGER) ? NA_STRING : SEXP( uniques[right_match[ pos -1 ] - 1] ) ;
- }
- }
-
- private:
- IntegerVector left, right ;
- CharacterVector left_levels, right_levels ;
- CharacterVector uniques ;
- IntegerVector left_match, right_match ;
- boost::hash<SEXP> hash_fun ;
-
- } ;
-
- class JoinStringStringVisitor : public JoinVisitor {
- public:
- typedef CharacterVector Vec ;
-
- JoinStringStringVisitor( CharacterVector left_, CharacterVector right) :
- left(left_),
- uniques( get_uniques(left, right) ),
- i_left( match(left, uniques) ),
- i_right( match(right, uniques) ),
- int_visitor( i_left, i_right),
- p_uniques( internal::r_vector_start<STRSXP>(uniques) ),
- p_left( i_left.begin() ),
- p_right( i_right.begin() )
- {}
-
- inline size_t hash(int i) {
- return int_visitor.hash(i) ;
- }
- bool equal(int i, int j) {
- return int_visitor.equal(i,j) ;
- }
-
- inline SEXP subset( const std::vector<int>& indices ) {
- RObject res = Subsetter<JoinStringStringVisitor>(*this).subset(indices) ;
- copy_most_attributes( res, left) ;
- return res ;
- }
-
- inline SEXP subset( const VisitorSetIndexSet<DataFrameJoinVisitors>& set ) {
- RObject res = Subsetter<JoinStringStringVisitor>(*this).subset(set) ;
- copy_most_attributes(res, left) ;
- return res ;
- }
-
- inline SEXP get(int i) const {
- if( i >= 0 ){
- return ( i_left[i] == NA_INTEGER ) ? NA_STRING : p_uniques[ p_left[i] - 1] ;
- } else {
- return ( i_right[-i-1] == NA_INTEGER ) ? NA_STRING : p_uniques[ p_right[-i-1] - 1] ;
- }
- }
-
- private:
- CharacterVector left ;
- CharacterVector uniques ;
- IntegerVector i_left, i_right ;
- JoinVisitorImpl<INTSXP,INTSXP> int_visitor ;
- SEXP* p_uniques ;
- int* p_left ;
- int* p_right ;
-
-
- } ;
-
- class JoinFactorStringVisitor : public JoinVisitor {
- public:
- typedef CharacterVector Vec ;
-
- JoinFactorStringVisitor( const IntegerVector& left_, const CharacterVector& right_ ) :
- left(left_),
- left_ptr(left.begin()),
-
- right(right_),
- uniques( get_uniques(left.attr("levels"), right) ) ,
- p_uniques( internal::r_vector_start<STRSXP>(uniques) ),
-
- i_right( match( right, uniques )),
- int_visitor( left, i_right )
-
- {}
-
- inline size_t hash(int i){
- return int_visitor.hash(i) ;
- }
-
- inline bool equal( int i, int j){
- return int_visitor.equal(i,j) ;
- }
-
- inline SEXP subset( const std::vector<int>& indices ) {
- RObject res = Subsetter<JoinFactorStringVisitor>(*this).subset(indices) ;
- // copy_most_attributes(res, left) ;
- return res ;
- }
-
- inline SEXP subset( const VisitorSetIndexSet<DataFrameJoinVisitors>& set ) {
- RObject res = Subsetter<JoinFactorStringVisitor>(*this).subset(set) ;
- // copy_most_attributes(res, left) ;
- return res ;
- }
-
- inline SEXP get(int i) const {
- if( i>=0 ){
- if( left_ptr[i] == NA_INTEGER ) return NA_STRING ;
- return p_uniques[ left_ptr[i] - 1 ] ;
- } else {
- return p_uniques[ i_right[ -i-1 ] - 1] ;
- }
- }
-
- private:
- IntegerVector left ;
- int* left_ptr ;
-
- CharacterVector right ;
- CharacterVector uniques ;
- SEXP* p_uniques ;
- IntegerVector i_right ;
-
- JoinVisitorImpl<INTSXP,INTSXP> int_visitor ;
-
-
- } ;
-
- class JoinStringFactorVisitor : public JoinVisitor {
- public:
- typedef CharacterVector Vec ;
-
- JoinStringFactorVisitor( const CharacterVector& left_, const IntegerVector& right_ ) :
- left(left_),
- i_right(right_),
- uniques( get_uniques(i_right.attr("levels"), left_) ),
- p_uniques( internal::r_vector_start<STRSXP>(uniques) ),
- i_left( match(left_, uniques) ),
-
- int_visitor(i_left, i_right)
- {}
-
- inline size_t hash(int i){
- return int_visitor.hash(i) ;
- }
-
- inline bool equal( int i, int j){
- return int_visitor.equal(i,j) ;
- }
-
- inline SEXP subset( const std::vector<int>& indices ) {
- RObject res = Subsetter<JoinStringFactorVisitor>(*this).subset(indices) ;
- // copy_most_attributes(res, left) ;
- return res;
- }
-
- inline SEXP subset( const VisitorSetIndexSet<DataFrameJoinVisitors>& set ) {
- RObject res = Subsetter<JoinStringFactorVisitor>(*this).subset(set) ;
- // copy_most_attributes(res, left) ;
- return res ;
- }
-
- inline SEXP get(int i) const {
- SEXP res ;
-
- if( i>=0 ){
- res = p_uniques[ i_left[i] - 1 ] ;
- } else {
- int index = -i-1 ;
- if( i_right[index] == NA_INTEGER ) {
- res = NA_STRING ;
- } else {
- res = p_uniques[ i_right[index] - 1 ] ;
- }
- }
-
- return res ;
- }
-
- private:
- CharacterVector left ;
- IntegerVector i_right ;
- CharacterVector uniques ;
- SEXP* p_uniques ;
- IntegerVector i_left ;
-
- JoinVisitorImpl<INTSXP, INTSXP> int_visitor ;
-
- } ;
-
-
-
- class POSIXctJoinVisitor : public JoinVisitorImpl<REALSXP,REALSXP> {
- public:
- typedef JoinVisitorImpl<REALSXP,REALSXP> Parent ;
- POSIXctJoinVisitor( NumericVector left, NumericVector right) :
- Parent(left, right),
- tzone(R_NilValue)
- {
- RObject tzone_left = left.attr("tzone") ;
- RObject tzone_right = right.attr("tzone") ;
- if( tzone_left.isNULL() && tzone_right.isNULL() ) return ;
-
- if( tzone_left.isNULL() ) {
- tzone = tzone_right ;
- } else if( tzone_right.isNULL() ) {
- tzone = tzone_left ;
- } else {
- std::string s_left = as<std::string>( tzone_left ) ;
- std::string s_right = as<std::string>( tzone_right ) ;
-
- if( s_left == s_right){
- tzone = wrap(s_left) ;
- } else {
- tzone = wrap("UTC") ;
- }
- }
- }
-
- inline SEXP subset( const std::vector<int>& indices ){
- return promote( Parent::subset( indices ) ) ;
- }
- inline SEXP subset( const VisitorSetIndexSet<DataFrameJoinVisitors>& set ) {
- return promote( Parent::subset(set)) ;
- }
-
- private:
- RObject tzone ;
-
- inline SEXP promote( NumericVector x){
- x.attr("class") = Rcpp::CharacterVector::create("POSIXct", "POSIXt") ;
- if( !tzone.isNULL() ){
- x.attr("tzone") = tzone ;
- }
- return x ;
- }
-
- } ;
-
- JoinVisitor* join_visitor( SEXP, SEXP, const std::string&, const std::string&, bool warn ) ;
-
- class DateJoinVisitorGetter {
- public:
- virtual ~DateJoinVisitorGetter(){} ;
- virtual double get(int i) = 0 ;
- } ;
-
- template <int RTYPE>
- class DateJoinVisitorGetterImpl : public DateJoinVisitorGetter {
- public:
- DateJoinVisitorGetterImpl( SEXP x) : data(x){}
-
- inline double get(int i){
- return (double) data[i] ;
- }
-
- private:
- Vector<RTYPE> data ;
- } ;
-
- class DateJoinVisitor : public JoinVisitor, public comparisons<REALSXP>{
- public:
- typedef NumericVector Vec ;
- typedef comparisons<REALSXP> Compare ;
- typedef boost::hash<double> hasher ;
-
- DateJoinVisitor( SEXP lhs, SEXP rhs)
- {
- if( TYPEOF(lhs) == INTSXP ) {
- left = new DateJoinVisitorGetterImpl<INTSXP>(lhs) ;
- } else if( TYPEOF(lhs) == REALSXP) {
- left = new DateJoinVisitorGetterImpl<REALSXP>(lhs) ;
- } else {
- stop("Date objects should be represented as integer or numeric") ;
- }
-
- if( TYPEOF(rhs) == INTSXP) {
- right = new DateJoinVisitorGetterImpl<INTSXP>(rhs) ;
- } else if( TYPEOF(rhs) == REALSXP) {
- right = new DateJoinVisitorGetterImpl<REALSXP>(rhs) ;
- } else {
- stop("Date objects should be represented as integer or numeric") ;
- }
-
- }
-
- ~DateJoinVisitor(){
- delete left ;
- delete right;
- }
-
- inline size_t hash(int i) {
- return hash_fun( get(i) ) ;
- }
- inline bool equal(int i, int j) {
- return Compare::equal_or_both_na(
- get(i), get(j)
- ) ;
- }
-
- inline SEXP subset( const std::vector<int>& indices ) {
- NumericVector res = Subsetter<DateJoinVisitor>(*this).subset(indices) ;
- res.attr("class") = "Date" ;
- return res ;
- }
-
- inline SEXP subset( const VisitorSetIndexSet<DataFrameJoinVisitors>& set ) {
- NumericVector res = Subsetter<DateJoinVisitor>(*this).subset(set) ;
- res.attr("class") = "Date" ;
- return res ;
- }
-
- inline double get( int i) const {
- if( i>= 0 ){
- return left->get(i) ;
- } else {
- return right->get(-i-1) ;
- }
- }
-
- private:
- DateJoinVisitorGetter* left ;
- DateJoinVisitorGetter* right ;
- hasher hash_fun ;
-
- DateJoinVisitor( const DateJoinVisitor& ) ;
-
-
- } ;
-
+#include <tools/utils.h>
+#include <tools/match.h>
+
+#include <dplyr/join_match.h>
+#include <dplyr/JoinVisitor.h>
+#include <dplyr/Column.h>
+
+namespace dplyr {
+
+CharacterVector get_uniques(const CharacterVector& left, const CharacterVector& right);
+
+void check_attribute_compatibility(const Column& left, const Column& right);
+
+template <int LHS_RTYPE, int RHS_RTYPE>
+class DualVector {
+public:
+ enum { RTYPE = (LHS_RTYPE > RHS_RTYPE ? LHS_RTYPE : RHS_RTYPE) };
+
+ typedef Vector<LHS_RTYPE> LHS_Vec;
+ typedef Vector<RHS_RTYPE> RHS_Vec;
+ typedef Vector<RTYPE> Vec;
+
+ typedef typename Rcpp::traits::storage_type<LHS_RTYPE>::type LHS_STORAGE;
+ typedef typename Rcpp::traits::storage_type<RHS_RTYPE>::type RHS_STORAGE;
+ typedef typename Rcpp::traits::storage_type<RTYPE>::type STORAGE;
+
+public:
+ DualVector(LHS_Vec left_, RHS_Vec right_) : left(left_), right(right_) {}
+
+ LHS_STORAGE get_left_value(const int i) const {
+ if (i < 0) stop("get_left_value() called with negative argument");
+ return left[i];
+ }
+
+ RHS_STORAGE get_right_value(const int i) const {
+ if (i >= 0) stop("get_right_value() called with nonnegative argument");
+ return right[-i - 1];
+ }
+
+ bool is_left_na(const int i) const {
+ return left.is_na(get_left_value(i));
+ }
+
+ bool is_right_na(const int i) const {
+ return right.is_na(get_right_value(i));
+ }
+
+ bool is_na(const int i) const {
+ if (i >= 0) return is_left_na(i);
+ else return is_right_na(i);
+ }
+
+ LHS_STORAGE get_value_as_left(const int i) const {
+ if (i >= 0) return get_left_value(i);
+ else {
+ RHS_STORAGE x = get_right_value(i);
+ if (LHS_RTYPE == RHS_RTYPE) return x;
+ else return Rcpp::internal::r_coerce<RHS_RTYPE, LHS_RTYPE>(x);
+ }
+ }
+
+ RHS_STORAGE get_value_as_right(const int i) const {
+ if (i >= 0) {
+ LHS_STORAGE x = get_left_value(i);
+ if (LHS_RTYPE == RHS_RTYPE) return x;
+ else return Rcpp::internal::r_coerce<LHS_RTYPE, RHS_RTYPE>(x);
+ }
+ else return get_right_value(i);
+ }
+
+ STORAGE get_value(const int i) const {
+ if (RTYPE == LHS_RTYPE) return get_value_as_left(i);
+ else return get_value_as_right(i);
+ }
+
+ template <class iterator>
+ SEXP subset(iterator it, const int n) {
+ // We use the fact that LGLSXP < INTSXP < REALSXP, this defines our coercion precedence
+ RObject ret;
+ if (LHS_RTYPE == RHS_RTYPE)
+ ret = subset_same(it, n);
+ else if (LHS_RTYPE > RHS_RTYPE)
+ ret = subset_left(it, n);
+ else
+ ret = subset_right(it, n);
+
+ copy_most_attributes(ret, left);
+ return ret;
+ }
+
+ template <class iterator>
+ SEXP subset_same(iterator it, const int n) {
+ Vec res = no_init(n);
+ for (int i = 0; i < n; i++, ++it) {
+ res[i] = get_value(*it);
+ }
+ return res;
+ }
+
+ template <class iterator>
+ SEXP subset_left(iterator it, const int n) {
+ LHS_Vec res = no_init(n);
+ for (int i = 0; i < n; i++, ++it) {
+ res[i] = get_value_as_left(*it);
+ }
+ return res;
+ }
+
+ template <class iterator>
+ SEXP subset_right(iterator it, const int n) {
+ RHS_Vec res = no_init(n);
+ for (int i = 0; i < n; i++, ++it) {
+ res[i] = get_value_as_right(*it);
+ }
+ return res;
+ }
+
+private:
+ LHS_Vec left;
+ RHS_Vec right;
+};
+
+template <int LHS_RTYPE, int RHS_RTYPE, bool ACCEPT_NA_MATCH>
+class JoinVisitorImpl : public JoinVisitor {
+protected:
+ typedef DualVector<LHS_RTYPE, RHS_RTYPE> Storage;
+ typedef boost::hash<typename Storage::STORAGE> hasher;
+ typedef typename Storage::LHS_Vec LHS_Vec;
+ typedef typename Storage::RHS_Vec RHS_Vec;
+ typedef typename Storage::Vec Vec;
+
+public:
+ JoinVisitorImpl(const Column& left, const Column& right, const bool warn) : dual((SEXP)left.get_data(), (SEXP)right.get_data()) {
+ if (warn) check_attribute_compatibility(left, right);
+ }
+
+ inline size_t hash(int i) {
+ // If NAs don't match, we want to distribute their hashes as evenly as possible
+ if (!ACCEPT_NA_MATCH && dual.is_na(i)) return static_cast<size_t>(i);
+
+ typename Storage::STORAGE x = dual.get_value(i);
+ return hash_fun(x);
+ }
+
+ inline bool equal(int i, int j) {
+ if (LHS_RTYPE == RHS_RTYPE) {
+ // Shortcut for same data type
+ return join_match<LHS_RTYPE, LHS_RTYPE, ACCEPT_NA_MATCH>::is_match(dual.get_value(i), dual.get_value(j));
+ }
+ else if (i >= 0 && j >= 0) {
+ return join_match<LHS_RTYPE, LHS_RTYPE, ACCEPT_NA_MATCH>::is_match(dual.get_left_value(i), dual.get_left_value(j));
+ } else if (i < 0 && j < 0) {
+ return join_match<RHS_RTYPE, RHS_RTYPE, ACCEPT_NA_MATCH>::is_match(dual.get_right_value(i), dual.get_right_value(j));
+ } else if (i >= 0 && j < 0) {
+ return join_match<LHS_RTYPE, RHS_RTYPE, ACCEPT_NA_MATCH>::is_match(dual.get_left_value(i), dual.get_right_value(j));
+ } else {
+ return join_match<RHS_RTYPE, LHS_RTYPE, ACCEPT_NA_MATCH>::is_match(dual.get_right_value(i), dual.get_left_value(j));
+ }
+ }
+
+ SEXP subset(const std::vector<int>& indices) {
+ return dual.subset(indices.begin(), indices.size());
+ }
+
+ SEXP subset(const VisitorSetIndexSet<DataFrameJoinVisitors>& set) {
+ return dual.subset(set.begin(), set.size());
+ }
+
+public:
+ hasher hash_fun;
+
+private:
+ Storage dual;
+};
+
+template <bool ACCEPT_NA_MATCH>
+class POSIXctJoinVisitor : public JoinVisitorImpl<REALSXP, REALSXP, ACCEPT_NA_MATCH> {
+ typedef JoinVisitorImpl<REALSXP, REALSXP, ACCEPT_NA_MATCH> Parent;
+
+public:
+ POSIXctJoinVisitor(const Column& left, const Column& right) :
+ Parent(left, right, false),
+ tzone(R_NilValue)
+ {
+ RObject tzone_left = left.get_data().attr("tzone");
+ RObject tzone_right = right.get_data().attr("tzone");
+ if (tzone_left.isNULL() && tzone_right.isNULL()) return;
+
+ if (tzone_left.isNULL()) {
+ tzone = tzone_right;
+ } else if (tzone_right.isNULL()) {
+ tzone = tzone_left;
+ } else {
+ std::string s_left = as<std::string>(tzone_left);
+ std::string s_right = as<std::string>(tzone_right);
+
+ if (s_left == s_right) {
+ tzone = wrap(s_left);
+ } else {
+ tzone = wrap("UTC");
+ }
+ }
+ }
+
+ inline SEXP subset(const std::vector<int>& indices) {
+ return promote(Parent::subset(indices));
+ }
+ inline SEXP subset(const VisitorSetIndexSet<DataFrameJoinVisitors>& set) {
+ return promote(Parent::subset(set));
+ }
+
+private:
+ inline SEXP promote(NumericVector x) {
+ set_class(x, Rcpp::CharacterVector::create("POSIXct", "POSIXt"));
+ if (!tzone.isNULL()) {
+ x.attr("tzone") = tzone;
+ }
+ return x;
+ }
+
+private:
+ RObject tzone;
+};
+
+class DateJoinVisitorGetter {
+public:
+ virtual ~DateJoinVisitorGetter() {};
+ virtual double get(int i) = 0;
+ virtual bool is_na(int i) const = 0;
+};
+
+template <int LHS_RTYPE, int RHS_RTYPE, bool ACCEPT_NA_MATCH>
+class DateJoinVisitor : public JoinVisitorImpl<LHS_RTYPE, RHS_RTYPE, ACCEPT_NA_MATCH> {
+ typedef JoinVisitorImpl<LHS_RTYPE, RHS_RTYPE, ACCEPT_NA_MATCH> Parent;
+
+public:
+ DateJoinVisitor(const Column& left, const Column& right) : Parent(left, right, false) {}
+
+ inline SEXP subset(const std::vector<int>& indices) {
+ return promote(Parent::subset(indices));
+ }
+
+ inline SEXP subset(const VisitorSetIndexSet<DataFrameJoinVisitors>& set) {
+ return promote(Parent::subset(set));
+ }
+
+private:
+ static SEXP promote(SEXP x) {
+ set_class(x, "Date");
+ return x;
+ }
+
+private:
+ typename Parent::hasher hash_fun;
+};
}
diff --git a/inst/include/dplyr/MatrixColumnSubsetVectorVisitor.h b/inst/include/dplyr/MatrixColumnSubsetVectorVisitor.h
index 6a48528..391a52b 100644
--- a/inst/include/dplyr/MatrixColumnSubsetVectorVisitor.h
+++ b/inst/include/dplyr/MatrixColumnSubsetVectorVisitor.h
@@ -3,98 +3,82 @@
namespace dplyr {
- template <int RTYPE>
- class MatrixColumnSubsetVisitor : public SubsetVectorVisitor {
- public:
-
- typedef typename Rcpp::traits::storage_type<RTYPE>::type STORAGE ;
- typedef typename Matrix<RTYPE>::Column Column ;
-
- MatrixColumnSubsetVisitor( const Matrix<RTYPE>& data_ ) : data(data_){}
-
- inline SEXP subset( const Rcpp::IntegerVector& index ) const {
- return subset_int( index ) ;
- }
-
- inline SEXP subset( const std::vector<int>& index ) const {
- return subset_int( index ) ;
- }
-
- inline SEXP subset( const SlicingIndex& index ) const {
- return subset_int( index ) ;
- }
-
- inline SEXP subset( const ChunkIndexMap& index ) const {
- int n = index.size() ;
- int nc = data.ncol() ;
- Matrix<RTYPE> res( n, data.ncol() ) ;
- for( int h=0; h<nc; h++){
- ChunkIndexMap::const_iterator it = index.begin();
- Column column = res.column(h) ;
- Column source_column = const_cast<Matrix<RTYPE>&>(data).column(h) ;
-
- for( int i=0; i<n; i++, ++it){
- column[i] = source_column[ it->first ] ;
- }
- }
- return res ;
- }
-
- inline SEXP subset( const Rcpp::LogicalVector& index ) const {
- int n = output_size(index) ;
- int nc = data.ncol() ;
- Matrix<RTYPE> res(n, data.ncol()) ;
- for( int h=0; h<nc; h++){
- Column column = res.column(h) ;
- Column source_column = const_cast<Matrix<RTYPE>&>(data).column(h) ;
-
- for( int i=0, k=0; k<n; k++, i++ ) {
- while( index[i] != TRUE ) i++;
- column[k] = source_column[i] ;
- }
- }
- return res ;
- }
-
- inline SEXP subset( EmptySubset index ) const {
- return Matrix<RTYPE>( 0, data.ncol() );
- }
-
- inline int size() const {
- return data.nrow() ;
- }
-
- inline std::string get_r_type() const {
- return "matrix" ;
- }
-
- inline bool is_compatible( SubsetVectorVisitor* other, std::stringstream&, const std::string& ) const {
- return true ;
- }
-
- private:
-
- template <typename Container>
- inline SEXP subset_int( const Container& index ) const {
- int n = index.size(), nc = data.ncol() ;
- Matrix<RTYPE> res( n, nc ) ;
- for( int h=0; h<nc; h++){
- Column column = res.column(h) ;
- Column source_column = const_cast<Matrix<RTYPE>&>(data).column(h) ;
- for(int k=0; k< n; k++){
- int idx = index[k] ;
- if( idx < 0){
- column[k] = Vector<RTYPE>::get_na() ;
- } else {
- column[k] = source_column[ index[k] ] ;
- }
- }
- }
- return res ;
+template <int RTYPE>
+class MatrixColumnSubsetVisitor : public SubsetVectorVisitor {
+public:
+
+ typedef typename Rcpp::traits::storage_type<RTYPE>::type STORAGE;
+ typedef typename Matrix<RTYPE>::Column Column;
+
+ MatrixColumnSubsetVisitor(const Matrix<RTYPE>& data_) : data(data_) {}
+
+ inline SEXP subset(const Rcpp::IntegerVector& index) const {
+ return subset_int(index);
+ }
+
+ inline SEXP subset(const std::vector<int>& index) const {
+ return subset_int(index);
+ }
+
+ inline SEXP subset(const SlicingIndex& index) const {
+ return subset_int(index);
+ }
+
+ inline SEXP subset(const ChunkIndexMap& index) const {
+ int n = index.size();
+ int nc = data.ncol();
+ Matrix<RTYPE> res(n, data.ncol());
+ for (int h = 0; h < nc; h++) {
+ ChunkIndexMap::const_iterator it = index.begin();
+ Column column = res.column(h);
+ Column source_column = const_cast<Matrix<RTYPE>&>(data).column(h);
+
+ for (int i = 0; i < n; i++, ++it) {
+ column[i] = source_column[ it->first ];
+ }
+ }
+ return res;
+ }
+
+ inline SEXP subset(EmptySubset) const {
+ return Matrix<RTYPE>(0, data.ncol());
+ }
+
+ inline int size() const {
+ return data.nrow();
+ }
+
+ inline std::string get_r_type() const {
+ return "matrix";
+ }
+
+ inline bool is_compatible(SubsetVectorVisitor* other, std::stringstream&, const SymbolString&) const {
+ return is_same_typeid(other);
+ }
+
+private:
+
+ template <typename Container>
+ inline SEXP subset_int(const Container& index) const {
+ int n = index.size(), nc = data.ncol();
+ Matrix<RTYPE> res(n, nc);
+ for (int h = 0; h < nc; h++) {
+ Column column = res.column(h);
+ Column source_column = const_cast<Matrix<RTYPE>&>(data).column(h);
+ for (int k = 0; k < n; k++) {
+ int idx = index[k];
+ if (idx < 0) {
+ column[k] = Vector<RTYPE>::get_na();
+ } else {
+ column[k] = source_column[ index[k] ];
}
+ }
+ }
+ return res;
+ }
- Matrix<RTYPE> data ;
- } ;
+ Matrix<RTYPE> data;
+};
}
diff --git a/inst/include/dplyr/MatrixColumnVisitor.h b/inst/include/dplyr/MatrixColumnVisitor.h
index f96ccce..0e07249 100644
--- a/inst/include/dplyr/MatrixColumnVisitor.h
+++ b/inst/include/dplyr/MatrixColumnVisitor.h
@@ -1,122 +1,120 @@
#ifndef dplyr_MatrixColumnVisitor_H
#define dplyr_MatrixColumnVisitor_H
+#include <dplyr/comparisons.h>
+
namespace dplyr {
- template <int RTYPE>
- class MatrixColumnVisitor : public VectorVisitor {
- public:
-
- typedef typename Rcpp::traits::storage_type<RTYPE>::type STORAGE ;
- typedef typename Matrix<RTYPE>::Column Column ;
-
- class ColumnVisitor : public comparisons<RTYPE> {
- public:
- typedef typename Rcpp::traits::storage_type<RTYPE>::type STORAGE ;
- typedef comparisons<RTYPE> compare ;
- typedef boost::hash<STORAGE> hasher ;
-
- ColumnVisitor( Matrix<RTYPE>& data, int column ) :
- column( data.column(column) ) {}
-
- inline size_t hash(int i) const {
- return hash_fun( const_cast<Column&>(column)[i] ) ;
- }
-
- inline bool equal( int i, int j ) const {
- return compare::equal_or_both_na( const_cast<Column&>(column)[i], const_cast<Column&>(column)[j] ) ;
- }
-
- inline bool less(int i, int j) const {
- return compare::is_less( const_cast<Column&>(column)[i], const_cast<Column&>(column)[j] ) ;
- }
-
- inline bool equal_or_both_na(int i, int j) const {
- return compare::equal_or_both_na( const_cast<Column&>(column)[i], const_cast<Column&>(column)[j] ) ;
- }
-
- inline bool greater(int i, int j) const {
- return compare::is_greater( const_cast<Column&>(column)[i], const_cast<Column&>(column)[j] ) ;
- }
-
- private:
- Column column ;
- hasher hash_fun ;
- } ;
-
- MatrixColumnVisitor( const Matrix<RTYPE>& data_ ) : data(data_), visitors() {
- for( int h=0; h<data.ncol(); h++){
- visitors.push_back( ColumnVisitor( data, h ) ) ;
- }
- }
-
- inline size_t hash(int i) const {
- size_t seed = visitors[0].hash(i) ;
- for( size_t h=1; h<visitors.size(); h++){
- boost::hash_combine( seed, visitors[h].hash(i) ) ;
- }
- return seed ;
- }
-
- inline bool equal(int i, int j) const {
- if( i == j ) return true ;
- for( size_t h=0; h<visitors.size(); h++){
- if( !visitors[h].equal(i,j) ) return false ;
- }
- return true ;
- }
-
- inline bool equal_or_both_na(int i, int j) const {
- if( i == j ) return true ;
- for( size_t h=0; h<visitors.size(); h++){
- if( !visitors[h].equal_or_both_na(i,j) ) return false ;
- }
- return true ;
- }
-
- inline bool less( int i, int j ) const {
- if( i == j ) return false ;
- for( size_t h=0; h<visitors.size(); h++){
- const ColumnVisitor& v = visitors[h] ;
- if( !v.equal(i,j) ){
- return v.less(i,j) ;
- }
- }
- return i < j ;
- }
-
- inline bool greater( int i, int j ) const {
- if( i == j ) return false ;
- for( size_t h=0; h<visitors.size(); h++){
- const ColumnVisitor& v = visitors[h] ;
- if( !v.equal(i,j) ){
- return v.greater(i,j) ;
- }
- }
- return i < j ;
- }
-
- inline int size() const {
- return data.nrow() ;
- }
-
- inline std::string get_r_type() const {
- return "matrix" ;
- }
-
- inline bool is_compatible( VectorVisitor* other, std::stringstream&, const std::string& ) const {
- return true ;
- }
-
- bool is_na( int i ) const {
- return false ;
- }
-
- private:
-
- Matrix<RTYPE> data ;
- std::vector<ColumnVisitor> visitors ;
- } ;
+template <int RTYPE>
+class MatrixColumnVisitor : public VectorVisitor {
+public:
+
+ typedef typename Rcpp::traits::storage_type<RTYPE>::type STORAGE;
+ typedef typename Matrix<RTYPE>::Column Column;
+
+ class ColumnVisitor {
+ public:
+ typedef typename Rcpp::traits::storage_type<RTYPE>::type STORAGE;
+ typedef comparisons<RTYPE> compare;
+ typedef boost::hash<STORAGE> hasher;
+
+ ColumnVisitor(Matrix<RTYPE>& data, int column) :
+ column(data.column(column)) {}
+
+ inline size_t hash(int i) const {
+ return hash_fun(const_cast<Column&>(column)[i]);
+ }
+
+ inline bool equal(int i, int j) const {
+ return compare::equal_or_both_na(const_cast<Column&>(column)[i], const_cast<Column&>(column)[j]);
+ }
+
+ inline bool less(int i, int j) const {
+ return compare::is_less(const_cast<Column&>(column)[i], const_cast<Column&>(column)[j]);
+ }
+
+ inline bool equal_or_both_na(int i, int j) const {
+ return compare::equal_or_both_na(const_cast<Column&>(column)[i], const_cast<Column&>(column)[j]);
+ }
+
+ inline bool greater(int i, int j) const {
+ return compare::is_greater(const_cast<Column&>(column)[i], const_cast<Column&>(column)[j]);
+ }
+
+ private:
+ Column column;
+ hasher hash_fun;
+ };
+
+ MatrixColumnVisitor(const Matrix<RTYPE>& data_) : data(data_), visitors() {
+ for (int h = 0; h < data.ncol(); h++) {
+ visitors.push_back(ColumnVisitor(data, h));
+ }
+ }
+
+ inline size_t hash(int i) const {
+ size_t seed = visitors[0].hash(i);
+ for (size_t h = 1; h < visitors.size(); h++) {
+ boost::hash_combine(seed, visitors[h].hash(i));
+ }
+ return seed;
+ }
+
+ inline bool equal(int i, int j) const {
+ if (i == j) return true;
+ for (size_t h = 0; h < visitors.size(); h++) {
+ if (!visitors[h].equal(i, j)) return false;
+ }
+ return true;
+ }
+
+ inline bool equal_or_both_na(int i, int j) const {
+ if (i == j) return true;
+ for (size_t h = 0; h < visitors.size(); h++) {
+ if (!visitors[h].equal_or_both_na(i, j)) return false;
+ }
+ return true;
+ }
+
+ inline bool less(int i, int j) const {
+ if (i == j) return false;
+ for (size_t h = 0; h < visitors.size(); h++) {
+ const ColumnVisitor& v = visitors[h];
+ if (!v.equal(i, j)) {
+ return v.less(i, j);
+ }
+ }
+ return i < j;
+ }
+
+ inline bool greater(int i, int j) const {
+ if (i == j) return false;
+ for (size_t h = 0; h < visitors.size(); h++) {
+ const ColumnVisitor& v = visitors[h];
+ if (!v.equal(i, j)) {
+ return v.greater(i, j);
+ }
+ }
+ return i < j;
+ }
+
+ inline int size() const {
+ return data.nrow();
+ }
+
+ inline std::string get_r_type() const {
+ return "matrix";
+ }
+
+ bool is_na(int) const {
+ return false;
+ }
+
+private:
+
+ Matrix<RTYPE> data;
+ std::vector<ColumnVisitor> visitors;
+};
}
diff --git a/inst/include/dplyr/MultipleVectorVisitors.h b/inst/include/dplyr/MultipleVectorVisitors.h
index 946c9d0..3425b97 100644
--- a/inst/include/dplyr/MultipleVectorVisitors.h
+++ b/inst/include/dplyr/MultipleVectorVisitors.h
@@ -1,56 +1,63 @@
#ifndef dplyr_MultipleVectorVisitors_H
#define dplyr_MultipleVectorVisitors_H
+#include <boost/shared_ptr.hpp>
+
+#include <dplyr/visitor_set/VisitorSetMixin.h>
+
+#include <dplyr/visitor.h>
+
namespace dplyr {
- class MultipleVectorVisitors :
- public VisitorSetEqual<MultipleVectorVisitors>,
- public VisitorSetHash<MultipleVectorVisitors>,
- public VisitorSetLess<MultipleVectorVisitors>,
- public VisitorSetGreater<MultipleVectorVisitors> {
-
- private:
- std::vector< boost::shared_ptr<VectorVisitor> > visitors ;
-
- public:
- typedef VectorVisitor visitor_type ;
-
- MultipleVectorVisitors() : visitors(){}
-
- MultipleVectorVisitors(List data) :
- visitors()
- {
- int n = data.size() ;
- for( int i=0; i<n; i++){
- push_back( data[i] ) ;
- }
- }
-
- inline int size() const {
- return visitors.size() ;
- }
- inline VectorVisitor* get(int k) const {
- return visitors[k].get() ;
- }
- inline int nrows() const {
- if( visitors.size() == 0 ){
- stop("need at least one column for nrows()") ;
- }
- return visitors[0]->size() ;
- }
- inline void push_back( SEXP x) {
- visitors.push_back( boost::shared_ptr<VectorVisitor>( visitor(x) ) ) ;
- }
-
- inline bool is_na(int index) const {
- int n = size() ;
- for( int i=0; i<n; i++) if( visitors[i]->is_na(index)) return true ;
- return false ;
- }
-
- } ;
+class MultipleVectorVisitors :
+ public VisitorSetEqual<MultipleVectorVisitors>,
+ public VisitorSetHash<MultipleVectorVisitors>,
+ public VisitorSetLess<MultipleVectorVisitors>,
+ public VisitorSetGreater<MultipleVectorVisitors> {
+
+private:
+ std::vector< boost::shared_ptr<VectorVisitor> > visitors;
+
+public:
+ typedef VectorVisitor visitor_type;
+
+ MultipleVectorVisitors() : visitors() {}
+
+ MultipleVectorVisitors(List data) :
+ visitors()
+ {
+ int n = data.size();
+ for (int i = 0; i < n; i++) {
+ push_back(data[i]);
+ }
+ }
+
+ inline int size() const {
+ return visitors.size();
+ }
+ inline VectorVisitor* get(int k) const {
+ return visitors[k].get();
+ }
+ inline int nrows() const {
+ if (visitors.size() == 0) {
+ stop("Need at least one column for `nrows()`");
+ }
+ return visitors[0]->size();
+ }
+ inline void push_back(SEXP x) {
+ visitors.push_back(boost::shared_ptr<VectorVisitor>(visitor(x)));
+ }
+
+ inline bool is_na(int index) const {
+ int n = size();
+ for (int i = 0; i < n; i++) if (visitors[i]->is_na(index)) return true;
+ return false;
+ }
+
+};
} // namespace dplyr
+#include <dplyr/visitor_impl.h>
#endif
diff --git a/inst/include/dplyr/NamedListAccumulator.h b/inst/include/dplyr/NamedListAccumulator.h
index 06e3b0d..d5b9b9b 100644
--- a/inst/include/dplyr/NamedListAccumulator.h
+++ b/inst/include/dplyr/NamedListAccumulator.h
@@ -1,51 +1,56 @@
#ifndef dplyr_NamedListAccumulator_H
#define dplyr_NamedListAccumulator_H
+#include <tools/SymbolMap.h>
+
+#include <dplyr/checks.h>
+
namespace dplyr {
- template <typename Data>
- class NamedListAccumulator {
- public:
- SymbolMap symbol_map ;
- std::vector<SEXP> data ;
-
- NamedListAccumulator(){}
-
- inline void set(SEXP name, SEXP x){
- if( ! Rcpp::traits::same_type<Data, RowwiseDataFrame>::value )
- check_supported_type(x, name);
-
- SymbolMapIndex index = symbol_map.insert(name) ;
- if( index.origin == NEW ){
- data.push_back(x);
- } else {
- data[ index.pos ] = x ;
- }
-
- }
-
- inline void rm(SEXP name){
- SymbolMapIndex index = symbol_map.rm(name) ;
- if( index.origin != NEW ){
- data.erase( data.begin() + index.pos ) ;
- }
- }
-
- inline operator List() const {
- List out = wrap(data) ;
- out.names() = symbol_map.names ;
- return out ;
- }
-
- inline size_t size() const {
- return data.size() ;
- }
-
- inline CharacterVector names() const {
- return symbol_map.names ;
- }
-
- } ;
+template <typename Data>
+class NamedListAccumulator {
+private:
+ SymbolMap symbol_map;
+ std::vector<RObject> data; // owns the results
+
+public:
+ NamedListAccumulator() {}
+
+ inline void set(const SymbolString& name, RObject x) {
+ if (! Rcpp::traits::same_type<Data, RowwiseDataFrame>::value)
+ check_supported_type(x, name);
+
+ SymbolMapIndex index = symbol_map.insert(name);
+ if (index.origin == NEW) {
+ data.push_back(x);
+ } else {
+ data[ index.pos ] = x;
+ }
+
+ }
+
+ inline void rm(const SymbolString& name) {
+ SymbolMapIndex index = symbol_map.rm(name);
+ if (index.origin != NEW) {
+ data.erase(data.begin() + index.pos);
+ }
+ }
+
+ inline operator List() const {
+ List out = wrap(data);
+ out.names() = symbol_map.get_names();
+ return out;
+ }
+
+ inline size_t size() const {
+ return data.size();
+ }
+
+ inline const SymbolVector names() const {
+ return symbol_map.get_names();
+ }
+
+};
}
#endif
diff --git a/inst/include/dplyr/Order.h b/inst/include/dplyr/Order.h
index 92dfd89..500028e 100644
--- a/inst/include/dplyr/Order.h
+++ b/inst/include/dplyr/Order.h
@@ -1,81 +1,78 @@
#ifndef dplyr_Order_H
#define dplyr_Order_H
+#include <tools/pointer_vector.h>
+
+#include <dplyr/OrderVisitorImpl.h>
+
namespace dplyr {
- class OrderVisitors_Compare ;
-
- class OrderVisitors {
- public:
-
- OrderVisitors( List args, LogicalVector ascending, int n_ ) :
- visitors(n_), n(n_), nrows(0){
- nrows = Rf_length( args[0] );
- for( int i=0; i<n; i++)
- visitors[i] = order_visitor( args[i], ascending[i] );
- }
- OrderVisitors( DataFrame data ) :
- visitors(data.size()), n(data.size()), nrows( data.nrows() )
- {
- for( int i=0; i<n; i++)
- visitors[i] = order_visitor( data[i], true );
- }
-
- OrderVisitors( DataFrame data, CharacterVector names ) :
- visitors(data.size()), n(names.size()), nrows( data.nrows() )
- {
- for( int i=0; i<n; i++){
- String name = names[i] ;
- visitors[i] = order_visitor( data[name], true );
- }
- }
-
- Rcpp::IntegerVector apply() const ;
-
- pointer_vector<OrderVisitor> visitors ;
- int n;
- int nrows ;
- } ;
-
- class OrderVisitors_Compare {
- public:
- OrderVisitors_Compare( const OrderVisitors& obj_ ) : obj(obj_), n(obj.n){}
-
- inline bool operator()(int i, int j) const {
- if( i == j ) return false ;
- for( int k=0; k<n; k++)
- if( ! obj.visitors[k]->equal(i,j) )
- return obj.visitors[k]->before(i, j ) ;
- return i < j ;
- }
-
- private:
- const OrderVisitors& obj ;
- int n ;
-
- } ;
-
- template <typename OrderVisitorClass>
- class Compare_Single_OrderVisitor{
- public:
- Compare_Single_OrderVisitor( const OrderVisitorClass& obj_) : obj(obj_){}
-
- inline bool operator()(int i, int j) const {
- if( i == j ) return false ;
- if( obj.equal(i,j) ) return i<j ;
- return obj.before(i,j) ;
- }
-
- private:
- const OrderVisitorClass& obj ;
- } ;
-
- inline Rcpp::IntegerVector OrderVisitors::apply() const {
- if( nrows == 0 ) return IntegerVector(0);
- IntegerVector x = seq(0, nrows -1 ) ;
- std::sort( x.begin(), x.end(), OrderVisitors_Compare(*this) ) ;
- return x ;
+class OrderVisitors_Compare;
+
+class OrderVisitors {
+public:
+
+ OrderVisitors(List args, LogicalVector ascending, int n_) :
+ visitors(n_), n(n_), nrows(0) {
+ nrows = Rf_length(args[0]);
+ for (int i = 0; i < n; i++) {
+ visitors[i] = order_visitor(args[i], ascending[i], i);
}
+ }
+
+ OrderVisitors(DataFrame data) :
+ visitors(data.size()), n(data.size()), nrows(data.nrows())
+ {
+ for (int i = 0; i < n; i++)
+ visitors[i] = order_visitor(data[i], true, i);
+ }
+
+ Rcpp::IntegerVector apply() const;
+
+ pointer_vector<OrderVisitor> visitors;
+ int n;
+ int nrows;
+};
+
+class OrderVisitors_Compare {
+public:
+ OrderVisitors_Compare(const OrderVisitors& obj_) : obj(obj_), n(obj.n) {}
+
+ inline bool operator()(int i, int j) const {
+ if (i == j) return false;
+ for (int k = 0; k < n; k++)
+ if (! obj.visitors[k]->equal(i, j))
+ return obj.visitors[k]->before(i, j);
+ return i < j;
+ }
+
+private:
+ const OrderVisitors& obj;
+ int n;
+
+};
+
+template <typename OrderVisitorClass>
+class Compare_Single_OrderVisitor {
+public:
+ Compare_Single_OrderVisitor(const OrderVisitorClass& obj_) : obj(obj_) {}
+
+ inline bool operator()(int i, int j) const {
+ if (i == j) return false;
+ if (obj.equal(i, j)) return i < j;
+ return obj.before(i, j);
+ }
+
+private:
+ const OrderVisitorClass& obj;
+};
+
+inline Rcpp::IntegerVector OrderVisitors::apply() const {
+ if (nrows == 0) return IntegerVector(0);
+ IntegerVector x = seq(0, nrows - 1);
+ std::sort(x.begin(), x.end(), OrderVisitors_Compare(*this));
+ return x;
+}
} // namespace dplyr
diff --git a/inst/include/dplyr/OrderVisitor.h b/inst/include/dplyr/OrderVisitor.h
index bf0cd3e..02d71d7 100644
--- a/inst/include/dplyr/OrderVisitor.h
+++ b/inst/include/dplyr/OrderVisitor.h
@@ -5,17 +5,17 @@ namespace dplyr {
class OrderVisitor {
public:
- virtual ~OrderVisitor(){}
+ virtual ~OrderVisitor() {}
- /** are the elements at indices i and j equal */
- virtual bool equal(int i, int j) const = 0 ;
+ /** are the elements at indices i and j equal */
+ virtual bool equal(int i, int j) const = 0;
- /** is the i element less than the j element */
- virtual bool before( int i, int j) const = 0 ;
+ /** is the i element less than the j element */
+ virtual bool before(int i, int j) const = 0;
- virtual SEXP get() = 0 ;
+ virtual SEXP get() = 0;
-} ;
+};
} // namespace dplyr
diff --git a/inst/include/dplyr/OrderVisitorImpl.h b/inst/include/dplyr/OrderVisitorImpl.h
index 745e2bf..46612ae 100644
--- a/inst/include/dplyr/OrderVisitorImpl.h
+++ b/inst/include/dplyr/OrderVisitorImpl.h
@@ -1,246 +1,280 @@
#ifndef dplyr_OrderVectorVisitor_Impl_H
#define dplyr_OrderVectorVisitor_Impl_H
+#include <dplyr/checks.h>
+
+#include <dplyr/comparisons.h>
+
+#include <dplyr/CharacterVectorOrderer.h>
+#include <dplyr/OrderVisitor.h>
+#include <dplyr/DataFrameVisitors.h>
+#include <dplyr/MatrixColumnVisitor.h>
+#include <dplyr/bad.h>
+
namespace dplyr {
- // version used for ascending = true
- template <int RTYPE, bool ascending, typename VECTOR>
- class OrderVectorVisitorImpl : public OrderVisitor, public comparisons<RTYPE> {
- typedef comparisons<RTYPE> compare ;
- public:
- /**
- * The type of data : int, double, SEXP, Rcomplex
- */
- typedef typename Rcpp::traits::storage_type<RTYPE>::type STORAGE ;
+// version used for ascending = true
+template <int RTYPE, bool ascending, typename VECTOR>
+class OrderVectorVisitorImpl : public OrderVisitor {
+ typedef comparisons<RTYPE> compare;
- OrderVectorVisitorImpl( const VECTOR& vec_ ) : vec(vec_) {}
-
- inline bool equal(int i, int j) const {
- return compare::equal_or_both_na( vec[i], vec[j] ) ;
- }
-
- inline bool before(int i, int j) const {
- return compare::is_less( vec[i], vec[j] ) ;
- }
-
- SEXP get(){ return vec ; }
-
- private:
- VECTOR vec ;
- } ;
+public:
+ /**
+ * The type of data : int, double, SEXP, Rcomplex
+ */
+ typedef typename Rcpp::traits::storage_type<RTYPE>::type STORAGE;
+
+ OrderVectorVisitorImpl(const VECTOR& vec_) : vec(vec_) {}
+
+ inline bool equal(int i, int j) const {
+ return compare::equal_or_both_na(vec[i], vec[j]);
+ }
+
+ inline bool before(int i, int j) const {
+ return compare::is_less(vec[i], vec[j]);
+ }
+
+ SEXP get() {
+ return vec;
+ }
+
+private:
+ VECTOR vec;
+};
+
+// version used for ascending = false
+template <int RTYPE, typename VECTOR>
+class OrderVectorVisitorImpl<RTYPE, false, VECTOR> : public OrderVisitor {
+ typedef comparisons<RTYPE> compare;
- // version used for ascending = false
- template <int RTYPE, typename VECTOR>
- class OrderVectorVisitorImpl<RTYPE,false, VECTOR> : public OrderVisitor, public comparisons<RTYPE> {
- typedef comparisons<RTYPE> compare ;
- public:
-
- /**
- * The type of data : int, double, SEXP, Rcomplex
- */
- typedef typename Rcpp::traits::storage_type<RTYPE>::type STORAGE ;
-
- OrderVectorVisitorImpl( const VECTOR& vec_ ) : vec(vec_) {}
-
- inline bool equal(int i, int j) const {
- return compare::equal_or_both_na(vec[i], vec[j]) ;
- }
-
- inline bool before(int i, int j) const {
- return compare::is_greater( vec[i], vec[j] ) ;
- }
-
- SEXP get(){ return vec ; }
-
- private:
- VECTOR vec ;
- } ;
-
- template <bool ascending>
- class OrderCharacterVectorVisitorImpl : public OrderVisitor {
- public:
- OrderCharacterVectorVisitorImpl( const CharacterVector& vec_ ) :
- vec(vec_),
- orders( CharacterVectorOrderer(vec).get() )
- {}
-
- inline bool equal(int i, int j) const {
- return orders.equal(i,j) ;
- }
-
- inline bool before( int i, int j) const{
- return orders.before(i,j);
- }
-
- SEXP get(){ return vec; }
-
- private:
- CharacterVector vec ;
- OrderVectorVisitorImpl<INTSXP, ascending, IntegerVector> orders ;
- } ;
-
- // ---------- data frame columns
+public:
+
+ /**
+ * The type of data : int, double, SEXP, Rcomplex
+ */
+ typedef typename Rcpp::traits::storage_type<RTYPE>::type STORAGE;
+
+ OrderVectorVisitorImpl(const VECTOR& vec_) : vec(vec_) {}
+
+ inline bool equal(int i, int j) const {
+ return compare::equal_or_both_na(vec[i], vec[j]);
+ }
+
+ inline bool before(int i, int j) const {
+ return compare::is_greater(vec[i], vec[j]);
+ }
+
+ SEXP get() {
+ return vec;
+ }
+
+private:
+ VECTOR vec;
+};
+
+template <bool ascending>
+class OrderCharacterVectorVisitorImpl : public OrderVisitor {
+public:
+ OrderCharacterVectorVisitorImpl(const CharacterVector& vec_) :
+ vec(vec_),
+ orders(CharacterVectorOrderer(vec).get())
+ {}
+
+ inline bool equal(int i, int j) const {
+ return orders.equal(i, j);
+ }
+
+ inline bool before(int i, int j) const {
+ return orders.before(i, j);
+ }
+
+ SEXP get() {
+ return vec;
+ }
+
+private:
+ CharacterVector vec;
+ OrderVectorVisitorImpl<INTSXP, ascending, IntegerVector> orders;
+};
+
+// ---------- data frame columns
- // ascending = true
- template <bool ascending>
- class OrderVisitorDataFrame : public OrderVisitor {
- public:
- OrderVisitorDataFrame( const DataFrame& data_ ) : data(data_), visitors(data) {}
+// ascending = true
+template <bool ascending>
+class OrderVisitorDataFrame : public OrderVisitor {
+public:
+ OrderVisitorDataFrame(const DataFrame& data_) : data(data_), visitors(data) {}
- inline bool equal( int i, int j) const {
- return visitors.equal(i,j) ;
- }
-
- inline bool before( int i, int j) const {
- return visitors.less(i,j) ;
- }
-
- inline SEXP get(){
- return data ;
- }
-
- private:
- DataFrame data ;
- DataFrameVisitors visitors ;
- } ;
-
- template <>
- class OrderVisitorDataFrame<false> : public OrderVisitor{
- public:
- OrderVisitorDataFrame( const DataFrame& data_ ) : data(data_), visitors(data) {}
-
- inline bool equal( int i, int j) const {
- return visitors.equal(i,j) ;
- }
+ inline bool equal(int i, int j) const {
+ return visitors.equal(i, j);
+ }
+
+ inline bool before(int i, int j) const {
+ return visitors.less(i, j);
+ }
+
+ inline SEXP get() {
+ return data;
+ }
+
+private:
+ DataFrame data;
+ DataFrameVisitors visitors;
+};
- inline bool before( int i, int j) const {
- return visitors.greater(i,j) ;
- }
-
- inline SEXP get(){
- return data ;
- }
-
- private:
- DataFrame data ;
- DataFrameVisitors visitors ;
- } ;
-
- // ---------- matrix columns
-
- // ascending = true
- template <int RTYPE, bool ascending>
- class OrderVisitorMatrix : public OrderVisitor {
- public:
- OrderVisitorMatrix( const Matrix<RTYPE>& data_ ) : data(data_), visitors(data) {}
-
- inline bool equal( int i, int j) const {
- return visitors.equal(i,j) ;
- }
-
- inline bool before( int i, int j) const {
- return visitors.less(i,j) ;
- }
-
- inline SEXP get(){
- return data ;
- }
-
- private:
- Matrix<RTYPE> data ;
- MatrixColumnVisitor<RTYPE> visitors ;
- } ;
-
- // ascending = false
- template <int RTYPE>
- class OrderVisitorMatrix<RTYPE, false> : public OrderVisitor {
- public:
- OrderVisitorMatrix( const Matrix<RTYPE>& data_ ) : data(data_), visitors(data) {}
-
- inline bool equal( int i, int j) const {
- return visitors.equal(i,j) ;
- }
-
- inline bool before( int i, int j) const {
- return visitors.greater(i,j) ;
- }
-
- inline SEXP get(){
- return data ;
- }
-
- private:
- Matrix<RTYPE> data ;
- MatrixColumnVisitor<RTYPE> visitors ;
- } ;
-
-
- inline OrderVisitor* order_visitor( SEXP vec, bool ascending );
-
- template <bool ascending>
- OrderVisitor* order_visitor_asc( SEXP vec );
-
- template <bool ascending>
- OrderVisitor* order_visitor_asc_matrix( SEXP vec );
-
- template <bool ascending>
- OrderVisitor* order_visitor_asc_vector( SEXP vec );
-
- inline OrderVisitor* order_visitor( SEXP vec, bool ascending ){
- if ( ascending ){
- return order_visitor_asc<true>(vec);
- }
- else {
- return order_visitor_asc<false>(vec);
- }
- }
+template <>
+class OrderVisitorDataFrame<false> : public OrderVisitor {
+public:
+ OrderVisitorDataFrame(const DataFrame& data_) : data(data_), visitors(data) {}
+
+ inline bool equal(int i, int j) const {
+ return visitors.equal(i, j);
+ }
+
+ inline bool before(int i, int j) const {
+ return visitors.greater(i, j);
+ }
+
+ inline SEXP get() {
+ return data;
+ }
+
+private:
+ DataFrame data;
+ DataFrameVisitors visitors;
+};
+
+// ---------- matrix columns
- template <bool ascending>
- inline OrderVisitor* order_visitor_asc( SEXP vec ) {
- if( Rf_isMatrix(vec) ){
- return order_visitor_asc_matrix<ascending>(vec) ;
- }
- else {
- return order_visitor_asc_vector<ascending>(vec) ;
- }
- }
+// ascending = true
+template <int RTYPE, bool ascending>
+class OrderVisitorMatrix : public OrderVisitor {
+public:
+ OrderVisitorMatrix(const Matrix<RTYPE>& data_) : data(data_), visitors(data) {}
+
+ inline bool equal(int i, int j) const {
+ return visitors.equal(i, j);
+ }
+
+ inline bool before(int i, int j) const {
+ return visitors.less(i, j);
+ }
+
+ inline SEXP get() {
+ return data;
+ }
+
+private:
+ Matrix<RTYPE> data;
+ MatrixColumnVisitor<RTYPE> visitors;
+};
+
+// ascending = false
+template <int RTYPE>
+class OrderVisitorMatrix<RTYPE, false> : public OrderVisitor {
+public:
+ OrderVisitorMatrix(const Matrix<RTYPE>& data_) : data(data_), visitors(data) {}
+
+ inline bool equal(int i, int j) const {
+ return visitors.equal(i, j);
+ }
+
+ inline bool before(int i, int j) const {
+ return visitors.greater(i, j);
+ }
+
+ inline SEXP get() {
+ return data;
+ }
+
+private:
+ Matrix<RTYPE> data;
+ MatrixColumnVisitor<RTYPE> visitors;
+};
+
+
+inline OrderVisitor* order_visitor(SEXP vec, const bool ascending, const int i);
+
+template <bool ascending>
+OrderVisitor* order_visitor_asc(SEXP vec);
- template <bool ascending>
- inline OrderVisitor* order_visitor_asc_matrix( SEXP vec ) {
- switch( check_supported_type(vec) ){
- case DPLYR_INTSXP: return new OrderVisitorMatrix<INTSXP , ascending>( vec ) ;
- case DPLYR_REALSXP: return new OrderVisitorMatrix<REALSXP , ascending>( vec ) ;
- case DPLYR_LGLSXP: return new OrderVisitorMatrix<LGLSXP , ascending>( vec ) ;
- case DPLYR_STRSXP: return new OrderVisitorMatrix<STRSXP , ascending>( vec ) ;
- case DPLYR_CPLXSXP: return new OrderVisitorMatrix<CPLXSXP , ascending>( vec ) ;
- case DPLYR_VECSXP: stop("Matrix can't be a list", Rf_type2char(TYPEOF(vec))) ;
- }
-
- stop("Unreachable") ;
- return 0 ;
+template <bool ascending>
+OrderVisitor* order_visitor_asc_matrix(SEXP vec);
+
+template <bool ascending>
+OrderVisitor* order_visitor_asc_vector(SEXP vec);
+
+inline OrderVisitor* order_visitor(SEXP vec, const bool ascending, const int i) {
+ try {
+ if (ascending) {
+ return order_visitor_asc<true>(vec);
+ }
+ else {
+ return order_visitor_asc<false>(vec);
}
+ }
+ catch (const Rcpp::exception& e) {
+ bad_pos_arg(i + 1, e.what());
+ }
+}
+
+template <bool ascending>
+inline OrderVisitor* order_visitor_asc(SEXP vec) {
+ if (Rf_isMatrix(vec)) {
+ return order_visitor_asc_matrix<ascending>(vec);
+ }
+ else {
+ return order_visitor_asc_vector<ascending>(vec);
+ }
+}
- template <bool ascending>
- inline OrderVisitor* order_visitor_asc_vector( SEXP vec ) {
- switch( TYPEOF(vec) ){
- case INTSXP: return new OrderVectorVisitorImpl<INTSXP , ascending, Vector<INTSXP > >( vec ) ;
- case REALSXP: return new OrderVectorVisitorImpl<REALSXP, ascending, Vector<REALSXP> >( vec ) ;
- case LGLSXP: return new OrderVectorVisitorImpl<LGLSXP , ascending, Vector<LGLSXP > >( vec ) ;
- case STRSXP: return new OrderCharacterVectorVisitorImpl<ascending>( vec ) ;
- case CPLXSXP: return new OrderVectorVisitorImpl<CPLXSXP , ascending, Vector<CPLXSXP > >( vec ) ;
- case VECSXP:
- {
- if( Rf_inherits( vec, "data.frame" ) ){
- return new OrderVisitorDataFrame<ascending>( vec ) ;
- }
- break ;
- }
- default: break ;
- }
-
- stop("Unsupported vector type %s", Rf_type2char(TYPEOF(vec))) ;
- return 0 ;
+template <bool ascending>
+inline OrderVisitor* order_visitor_asc_matrix(SEXP vec) {
+ switch (check_supported_type(vec)) {
+ case DPLYR_INTSXP:
+ return new OrderVisitorMatrix<INTSXP, ascending>(vec);
+ case DPLYR_REALSXP:
+ return new OrderVisitorMatrix<REALSXP, ascending>(vec);
+ case DPLYR_LGLSXP:
+ return new OrderVisitorMatrix<LGLSXP, ascending>(vec);
+ case DPLYR_STRSXP:
+ return new OrderVisitorMatrix<STRSXP, ascending>(vec);
+ case DPLYR_CPLXSXP:
+ return new OrderVisitorMatrix<CPLXSXP, ascending>(vec);
+ case DPLYR_VECSXP:
+ stop("Matrix can't be a list");
+ }
+
+ stop("Unreachable");
+ return 0;
+}
+
+template <bool ascending>
+inline OrderVisitor* order_visitor_asc_vector(SEXP vec) {
+ switch (TYPEOF(vec)) {
+ case INTSXP:
+ return new OrderVectorVisitorImpl<INTSXP, ascending, Vector<INTSXP > >(vec);
+ case REALSXP:
+ return new OrderVectorVisitorImpl<REALSXP, ascending, Vector<REALSXP> >(vec);
+ case LGLSXP:
+ return new OrderVectorVisitorImpl<LGLSXP, ascending, Vector<LGLSXP > >(vec);
+ case STRSXP:
+ return new OrderCharacterVectorVisitorImpl<ascending>(vec);
+ case CPLXSXP:
+ return new OrderVectorVisitorImpl<CPLXSXP, ascending, Vector<CPLXSXP > >(vec);
+ case VECSXP:
+ {
+ if (Rf_inherits(vec, "data.frame")) {
+ return new OrderVisitorDataFrame<ascending>(vec);
}
+ break;
+ }
+ default:
+ break;
+ }
+
+ stop("is of unsupported type %s", Rf_type2char(TYPEOF(vec)));
+}
}
#endif
diff --git a/inst/include/dplyr/Replicator.h b/inst/include/dplyr/Replicator.h
index 0909c2a..71ec4d8 100644
--- a/inst/include/dplyr/Replicator.h
+++ b/inst/include/dplyr/Replicator.h
@@ -1,54 +1,61 @@
#ifndef dplyr_Replicator_H
#define dplyr_Replicator_H
+#include <tools/utils.h>
+
namespace dplyr {
- class Replicator {
- public:
- virtual ~Replicator(){}
- virtual SEXP collect() = 0 ;
- } ;
-
- template <int RTYPE, typename Data>
- class ReplicatorImpl : public Replicator {
- public:
- typedef typename traits::storage_type<RTYPE>::type STORAGE ;
-
- ReplicatorImpl( SEXP v, int n_, int ngroups_) :
- data( no_init(n_*ngroups_) ), source(v), n(n_), ngroups(ngroups_) {}
-
- SEXP collect(){
- for( int i=0, k=0; i<ngroups; i++){
- for( int j=0; j<n; j++, k++){
- data[k] = source[j] ;
- }
- }
- copy_most_attributes( data, source ) ;
- return data ;
- }
-
- private:
- Vector<RTYPE> data ;
- Vector<RTYPE> source ;
- int n ;
- int ngroups ;
- } ;
-
- template <typename Data>
- inline Replicator* replicator( SEXP v, const Data& gdf ){
- int n = Rf_length(v) ;
- switch( TYPEOF(v) ){
- case INTSXP: return new ReplicatorImpl<INTSXP , Data> ( v, n, gdf.ngroups() ) ;
- case REALSXP: return new ReplicatorImpl<REALSXP, Data> ( v, n, gdf.ngroups() ) ;
- case STRSXP: return new ReplicatorImpl<STRSXP , Data> ( v, n, gdf.ngroups() ) ;
- case LGLSXP: return new ReplicatorImpl<LGLSXP , Data> ( v, n, gdf.ngroups() ) ;
- case CPLXSXP: return new ReplicatorImpl<CPLXSXP, Data> ( v, n, gdf.ngroups() ) ;
- default: break ;
- }
-
- stop("Unsupported vector type %s", Rf_type2char(TYPEOF(v))) ;
- return 0 ;
+class Replicator {
+public:
+ virtual ~Replicator() {}
+ virtual SEXP collect() = 0;
+};
+
+template <int RTYPE, typename Data>
+class ReplicatorImpl : public Replicator {
+public:
+ typedef typename Rcpp::traits::storage_type<RTYPE>::type STORAGE;
+
+ ReplicatorImpl(SEXP v, int n_, int ngroups_) :
+ data(no_init(n_ * ngroups_)), source(v), n(n_), ngroups(ngroups_) {}
+
+ SEXP collect() {
+ for (int i = 0, k = 0; i < ngroups; i++) {
+ for (int j = 0; j < n; j++, k++) {
+ data[k] = source[j];
+ }
}
+ copy_most_attributes(data, source);
+ return data;
+ }
+
+private:
+ Vector<RTYPE> data;
+ Vector<RTYPE> source;
+ int n;
+ int ngroups;
+};
+
+template <typename Data>
+inline Replicator* replicator(SEXP v, const Data& gdf) {
+ int n = Rf_length(v);
+ switch (TYPEOF(v)) {
+ case INTSXP:
+ return new ReplicatorImpl<INTSXP, Data> (v, n, gdf.ngroups());
+ case REALSXP:
+ return new ReplicatorImpl<REALSXP, Data> (v, n, gdf.ngroups());
+ case STRSXP:
+ return new ReplicatorImpl<STRSXP, Data> (v, n, gdf.ngroups());
+ case LGLSXP:
+ return new ReplicatorImpl<LGLSXP, Data> (v, n, gdf.ngroups());
+ case CPLXSXP:
+ return new ReplicatorImpl<CPLXSXP, Data> (v, n, gdf.ngroups());
+ default:
+ break;
+ }
+
+ stop("is of unsupported type %s", Rf_type2char(TYPEOF(v)));
+}
} // namespace dplyr
diff --git a/inst/include/dplyr/Result/CallElementProxy.h b/inst/include/dplyr/Result/CallElementProxy.h
index 1d3a791..599a1eb 100644
--- a/inst/include/dplyr/Result/CallElementProxy.h
+++ b/inst/include/dplyr/Result/CallElementProxy.h
@@ -3,19 +3,19 @@
namespace dplyr {
- class CallElementProxy {
- public:
- CallElementProxy(SEXP symbol_, SEXP object_) :
- symbol(symbol_), object(object_)
- {}
+class CallElementProxy {
+public:
+ CallElementProxy(SEXP symbol_, SEXP object_) :
+ symbol(symbol_), object(object_)
+ {}
- inline void set(SEXP value){
- SETCAR(object, value) ;
- }
+ inline void set(SEXP value) {
+ SETCAR(object, value);
+ }
- SEXP symbol;
- SEXP object;
- } ;
+ SEXP symbol;
+ SEXP object;
+};
}
diff --git a/inst/include/dplyr/Result/CallProxy.h b/inst/include/dplyr/Result/CallProxy.h
index 22bb785..c2db821 100644
--- a/inst/include/dplyr/Result/CallProxy.h
+++ b/inst/include/dplyr/Result/CallProxy.h
@@ -1,70 +1,12 @@
#ifndef dplyr_CallProxy_H
#define dplyr_CallProxy_H
-namespace dplyr {
-
- class CallProxy {
- public:
- CallProxy( const Rcpp::Call& call_, LazySubsets& subsets_, const Environment& env_) :
- call(call_), subsets(subsets_), proxies(), env(env_)
- {
- // fill proxies
- set_call(call);
- }
-
- CallProxy( const Rcpp::Call& call_, const Rcpp::DataFrame& data_, const Environment& env_) :
- call(call_), subsets(data_), proxies(), env(env_)
- {
- // fill proxies
- set_call(call);
- }
-
- CallProxy( const Rcpp::DataFrame& data_, const Environment& env_ ) :
- subsets(data_), proxies(), env(env_){
- }
-
- CallProxy( const Rcpp::DataFrame& data_) :
- subsets(data_), proxies() {
- }
-
- ~CallProxy(){}
-
- SEXP eval() ;
-
- void set_call( SEXP call_ ) ;
+#include <dplyr/Result/GroupedCallProxy.h>
+#include <dplyr/Result/LazySubsets.h>
- void input( Rcpp::String name, SEXP x ){
- subsets.input( name.get_sexp(), x ) ;
- }
-
- inline int nsubsets(){
- return subsets.size() ;
- }
-
- inline SEXP get_variable( Rcpp::String name ) const {
- return subsets.get_variable( Symbol(name) );
- }
-
- inline bool has_variable(SEXP symbol){
- return subsets.count(symbol) ;
- }
-
- inline void set_env(SEXP env_){
- env = env_ ;
- }
-
- private:
-
- bool simplified(const SlicingIndex& indices) ;
- bool replace( SEXP p, const SlicingIndex& indices ) ;
- void traverse_call( SEXP obj ) ;
-
- Rcpp::Call call ;
- LazySubsets subsets ;
- std::vector<CallElementProxy> proxies ;
- Environment env;
+namespace dplyr {
- } ;
+typedef GroupedCallProxy<Rcpp::DataFrame, LazySubsets> CallProxy;
}
diff --git a/inst/include/dplyr/Result/CallbackProcessor.h b/inst/include/dplyr/Result/CallbackProcessor.h
index 5000f10..2a7dea7 100644
--- a/inst/include/dplyr/Result/CallbackProcessor.h
+++ b/inst/include/dplyr/Result/CallbackProcessor.h
@@ -1,89 +1,129 @@
#ifndef dplyr_Result_CallbackProcessor_H
#define dplyr_Result_CallbackProcessor_H
-namespace dplyr{
-
- // classes inherit from this template when they have a method with this signature
- // SEXP process_chunk( const SlicingIndex& indices)
- //
- // the first time process_chunk is called, CallbackProcessor finds the right type
- // for storing the results, and it creates a suitable DelayedProcessor
- // object which is then used to fill the vector
- //
- // DelayedReducer is an example on how CallbackReducer is used
- //
- // it is assumed that the SEXP comes from evaluating some R expression, so
- // it should be one of a integer vector of length one, a numeric vector of
- // length one or a character vector of length one
- template <typename CLASS>
- class CallbackProcessor : public Result {
- public:
- CallbackProcessor(){}
-
- virtual SEXP process( const GroupedDataFrame& gdf){
- return process_data<GroupedDataFrame>( gdf ) ;
- }
+#include <boost/scoped_ptr.hpp>
- virtual SEXP process( const RowwiseDataFrame& gdf){
- return process_data<RowwiseDataFrame>( gdf ) ;
- }
+#include <tools/all_na.h>
- virtual SEXP process( const Rcpp::FullDataFrame& df){
- CLASS* obj = static_cast<CLASS*>(this) ;
- return obj->process_chunk(df.get_index()) ;
- }
+#include <dplyr/Result/Result.h>
+#include <dplyr/Result/DelayedProcessor.h>
- virtual SEXP process( const SlicingIndex& index ){
- return R_NilValue ;
- }
+#include <dplyr/bad.h>
+
+namespace dplyr {
+
+// classes inherit from this template when they have a method with this signature
+// SEXP process_chunk( const SlicingIndex& indices)
+//
+// the first time process_chunk is called, CallbackProcessor finds the right type
+// for storing the results, and it creates a suitable DelayedProcessor
+// object which is then used to fill the vector
+//
+// DelayedReducer is an example on how CallbackReducer is used
+//
+// it is assumed that the SEXP comes from evaluating some R expression, so
+// it should be one of a integer vector of length one, a numeric vector of
+// length one or a character vector of length one
+template <typename CLASS>
+class CallbackProcessor : public Result {
+public:
+ CallbackProcessor() {}
+
+ CLASS* obj() {
+ return static_cast<CLASS*>(this);
+ }
+
+ virtual SEXP process(const GroupedDataFrame& gdf) {
+ return process_data<GroupedDataFrame>(gdf, obj()).run();
+ }
+
+ virtual SEXP process(const RowwiseDataFrame& gdf) {
+ return process_data<RowwiseDataFrame>(gdf, obj()).run();
+ }
+
+ virtual SEXP process(const Rcpp::FullDataFrame& df) {
+ return obj()->process_chunk(df.get_index());
+ }
+
+ virtual SEXP process(const SlicingIndex&) {
+ return R_NilValue;
+ }
+
+private:
- private:
-
- template <typename Data>
- SEXP process_data( const Data& gdf ){
- CLASS* obj = static_cast<CLASS*>( this ) ;
- typename Data::group_iterator git = gdf.group_begin() ;
-
- // the group index
- int i = 0 ;
- int ngroups = gdf.ngroups() ;
- // evaluate the expression within each group until we find something that is not NA
- RObject first_result(R_NilValue) ;
- for( ; i<ngroups; i++, ++git ){
- first_result = obj->process_chunk(*git) ;
- if( ! all_na(first_result) ) break ;
- }
- // all the groups evaluated to NA, so we send a logical vector NA
- // perhaps the type of the vector could depend on something, maybe later
- if( i == ngroups ){
- return LogicalVector(ngroups, NA_LOGICAL) ;
- }
-
- // otherwise, instantiate a DelayedProcessor based on the first non NA
- // result we get
-
- // get the appropriate Delayed Processor to handle it
- boost::scoped_ptr< DelayedProcessor_Base<CLASS> > processor(
- get_delayed_processor<CLASS>(i, first_result, ngroups )
- ) ;
- if(!processor)
- stop( "expecting a single value" );
- for( ; i<ngroups ; i++, ++git ){
- first_result = obj->process_chunk(*git) ;
- if( !processor->handled(i, first_result) ){
- if( processor->can_promote(first_result) ){
- processor.reset(
- processor->promote(i, first_result)
- ) ;
- }
- }
- }
-
- Shield<SEXP> res( processor->get() ) ;
- return res ;
+ template <typename Data>
+ class process_data {
+ public:
+ process_data(const Data& gdf, CLASS* chunk_source_) : git(gdf.group_begin()), ngroups(gdf.ngroups()), chunk_source(chunk_source_) {}
+
+ SEXP run() {
+ if (ngroups == 0) {
+ LOG_INFO << "no groups to process";
+ return get_processed_empty();
+ }
+
+ LOG_INFO << "processing groups";
+ process_first();
+ process_rest();
+ return get_processed();
+ }
+
+ private:
+ void process_first() {
+ const RObject& first_result = fetch_chunk();
+ LOG_INFO << "instantiating delayed processor for type " << type2name(first_result)
+ << " for column `" << chunk_source->get_name().get_utf8_cstring() << "`";
+
+ processor.reset(get_delayed_processor<CLASS>(first_result, ngroups, chunk_source->get_name()));
+ LOG_VERBOSE << "processing " << ngroups << " groups with " << processor->describe() << " processor";
+ }
+
+ void process_rest() {
+ for (int i = 1; i < ngroups; ++i) {
+ const RObject& chunk = fetch_chunk();
+ if (!try_handle_chunk(chunk)) {
+ LOG_VERBOSE << "not handled group " << i;
+ handle_chunk_with_promotion(chunk, i);
}
+ }
+ }
+
+ bool try_handle_chunk(const RObject& chunk) const {
+ return processor->try_handle(chunk);
+ }
+
+ void handle_chunk_with_promotion(const RObject& chunk, const int i) {
+ IDelayedProcessor* new_processor = processor->promote(chunk);
+ if (!new_processor) {
+ bad_col(chunk_source->get_name(), "can't promote group {group} to {type}",
+ _["group"] = i, _["type"] = processor->describe());
+ }
+
+ LOG_VERBOSE << "promoted group " << i;
+ processor.reset(new_processor);
+ }
+
+ RObject fetch_chunk() {
+ const RObject& chunk = chunk_source->process_chunk(*git);
+ ++git;
+ return chunk;
+ }
+
+ SEXP get_processed() const {
+ return processor->get();
+ }
+
+ static SEXP get_processed_empty() {
+ return LogicalVector(0, NA_LOGICAL);
+ }
- } ;
+ private:
+ typename Data::group_iterator git;
+ const int ngroups;
+ boost::scoped_ptr<IDelayedProcessor> processor;
+ CLASS* chunk_source;
+ };
+};
}
#endif
diff --git a/inst/include/dplyr/Result/ConstantResult.h b/inst/include/dplyr/Result/ConstantResult.h
index 0678e2e..e1b1f44 100644
--- a/inst/include/dplyr/Result/ConstantResult.h
+++ b/inst/include/dplyr/Result/ConstantResult.h
@@ -1,108 +1,110 @@
#ifndef dplyr_Result_ConstantResult_H
#define dplyr_Result_ConstantResult_H
+#include <dplyr/Result/Result.h>
+
namespace dplyr {
- template <int RTYPE>
- class ConstantResult : public Result {
- public:
- typedef typename Rcpp::traits::storage_type<RTYPE>::type STORAGE ;
+template <int RTYPE>
+class ConstantResult : public Result {
+public:
+ typedef typename Rcpp::traits::storage_type<RTYPE>::type STORAGE;
- ConstantResult(SEXP x) : value( Rcpp::internal::r_vector_start<RTYPE>(x)[0] ) {}
+ ConstantResult(SEXP x) : value(Rcpp::internal::r_vector_start<RTYPE>(x)[0]) {}
- SEXP process( const GroupedDataFrame& gdf) {
- return Vector<RTYPE>( gdf.ngroups(), value ) ;
- }
+ SEXP process(const GroupedDataFrame& gdf) {
+ return Vector<RTYPE>(gdf.ngroups(), value);
+ }
- SEXP process( const RowwiseDataFrame& gdf) {
- return Vector<RTYPE>( gdf.ngroups(), value ) ;
- }
+ SEXP process(const RowwiseDataFrame& gdf) {
+ return Vector<RTYPE>(gdf.ngroups(), value);
+ }
- virtual SEXP process( const FullDataFrame& df) {
- return Vector<RTYPE>::create( value ) ;
- }
+ virtual SEXP process(const FullDataFrame&) {
+ return Vector<RTYPE>::create(value);
+ }
- virtual SEXP process( const SlicingIndex& index ){
- return Vector<RTYPE>::create( value ) ;
- }
+ virtual SEXP process(const SlicingIndex&) {
+ return Vector<RTYPE>::create(value);
+ }
- STORAGE value ;
- } ;
+ STORAGE value;
+};
- template <int RTYPE>
- class TypedConstantResult : public Result {
- public:
- typedef typename Rcpp::traits::storage_type<RTYPE>::type STORAGE ;
+template <int RTYPE>
+class TypedConstantResult : public Result {
+public:
+ typedef typename Rcpp::traits::storage_type<RTYPE>::type STORAGE;
- TypedConstantResult(SEXP x, SEXP classes_) :
- value( Rcpp::internal::r_vector_start<RTYPE>(x)[0] ), classes(classes_) {}
+ TypedConstantResult(SEXP x, SEXP classes_) :
+ value(Rcpp::internal::r_vector_start<RTYPE>(x)[0]), classes(classes_) {}
- SEXP process( const GroupedDataFrame& gdf) {
- return get(gdf.ngroups()) ;
- }
+ SEXP process(const GroupedDataFrame& gdf) {
+ return get(gdf.ngroups());
+ }
- SEXP process( const RowwiseDataFrame& gdf) {
- return get(gdf.ngroups()) ;
- }
+ SEXP process(const RowwiseDataFrame& gdf) {
+ return get(gdf.ngroups());
+ }
- virtual SEXP process( const FullDataFrame& df) {
- return get(1);
- }
+ virtual SEXP process(const FullDataFrame&) {
+ return get(1);
+ }
- virtual SEXP process( const SlicingIndex& index ){
- return get(1);
- }
+ virtual SEXP process(const SlicingIndex&) {
+ return get(1);
+ }
- private:
+private:
- SEXP get( int n ) const {
- Vector<RTYPE> res(n, value);
- res.attr("class") = classes ;
- return res ;
- }
+ SEXP get(int n) const {
+ Vector<RTYPE> res(n, value);
+ set_class(res, classes);
+ return res;
+ }
- STORAGE value ;
- SEXP classes ;
- } ;
+ STORAGE value;
+ SEXP classes;
+};
- template <int RTYPE>
- class DifftimeConstantResult : public Result {
- public:
- typedef typename Rcpp::traits::storage_type<RTYPE>::type STORAGE ;
+template <int RTYPE>
+class DifftimeConstantResult : public Result {
+public:
+ typedef typename Rcpp::traits::storage_type<RTYPE>::type STORAGE;
- DifftimeConstantResult(SEXP x) :
- value( Rcpp::internal::r_vector_start<RTYPE>(x)[0] ),
- units(Rf_getAttrib(x, Rf_install("units")))
- {}
+ DifftimeConstantResult(SEXP x) :
+ value(Rcpp::internal::r_vector_start<RTYPE>(x)[0]),
+ units(Rf_getAttrib(x, Rf_install("units")))
+ {}
- SEXP process( const GroupedDataFrame& gdf) {
- return get(gdf.ngroups()) ;
- }
+ SEXP process(const GroupedDataFrame& gdf) {
+ return get(gdf.ngroups());
+ }
- SEXP process( const RowwiseDataFrame& gdf) {
- return get(gdf.ngroups()) ;
- }
+ SEXP process(const RowwiseDataFrame& gdf) {
+ return get(gdf.ngroups());
+ }
- virtual SEXP process( const FullDataFrame& df) {
- return get(1);
- }
+ virtual SEXP process(const FullDataFrame&) {
+ return get(1);
+ }
- virtual SEXP process( const SlicingIndex& index ){
- return get(1);
- }
+ virtual SEXP process(const SlicingIndex&) {
+ return get(1);
+ }
- private:
+private:
- SEXP get( int n ) const {
- Vector<RTYPE> res(n, value);
- res.attr("class") = "difftime" ;
- res.attr("units") = units ;
- return res ;
- }
+ SEXP get(int n) const {
+ Vector<RTYPE> res(n, value);
+ set_class(res, "difftime");
+ res.attr("units") = units;
+ return res;
+ }
- STORAGE value ;
- CharacterVector units ;
- } ;
+ STORAGE value;
+ CharacterVector units;
+};
}
diff --git a/inst/include/dplyr/Result/Count.h b/inst/include/dplyr/Result/Count.h
index f9cd51a..2cb147d 100644
--- a/inst/include/dplyr/Result/Count.h
+++ b/inst/include/dplyr/Result/Count.h
@@ -1,17 +1,19 @@
#ifndef dplyr_Result_Count_H
#define dplyr_Result_Count_H
+#include <dplyr/Result/Processor.h>
+
namespace dplyr {
- class Count : public Processor<INTSXP,Count> {
- public:
- Count() {}
- ~Count(){}
+class Count : public Processor<INTSXP, Count> {
+public:
+ Count() {}
+ ~Count() {}
- inline int process_chunk( const SlicingIndex& indices ){
- return indices.size() ;
- }
- } ;
+ inline int process_chunk(const SlicingIndex& indices) {
+ return indices.size();
+ }
+};
}
diff --git a/inst/include/dplyr/Result/Count_Distinct.h b/inst/include/dplyr/Result/Count_Distinct.h
index 38a5cf2..8441a05 100644
--- a/inst/include/dplyr/Result/Count_Distinct.h
+++ b/inst/include/dplyr/Result/Count_Distinct.h
@@ -1,60 +1,68 @@
#ifndef dplyr_Result_Count_Distinct_H
#define dplyr_Result_Count_Distinct_H
+#include <tools/hash.h>
+
+#include <dplyr/visitor_set/VisitorEqualPredicate.h>
+#include <dplyr/visitor_set/VisitorHash.h>
+#include <dplyr/Result/Processor.h>
+
namespace dplyr {
- template <typename Visitor>
- class Count_Distinct : public Processor<INTSXP, Count_Distinct<Visitor> > {
- public:
- typedef VisitorHash<Visitor> Hash ;
- typedef VisitorEqualPredicate<Visitor> Pred ;
- typedef dplyr_hash_set<int, Hash, Pred > Set ;
-
- Count_Distinct(Visitor v_):
- v(v_), set(1024, Hash(v), Pred(v) )
- {}
-
- inline int process_chunk( const SlicingIndex& indices ){
- set.clear() ;
- int n = indices.size() ;
- for( int i=0; i<n; i++){
- set.insert( indices[i] ) ;
- }
- return set.size() ;
- }
-
- private:
- Visitor v ;
- Set set ;
- } ;
-
- template <typename Visitor>
- class Count_Distinct_Narm : public Processor<INTSXP, Count_Distinct_Narm<Visitor> > {
- public:
- typedef VisitorHash<Visitor> Hash ;
- typedef VisitorEqualPredicate<Visitor> Pred ;
- typedef dplyr_hash_set<int, Hash, Pred > Set ;
-
- Count_Distinct_Narm(Visitor v_):
- v(v_), set(1024, Hash(v), Pred(v) )
- {}
-
- inline int process_chunk( const SlicingIndex& indices ){
- set.clear() ;
- int n = indices.size() ;
- for( int i=0; i<n; i++){
- int index=indices[i] ;
- if( ! v.is_na(index) ){
- set.insert( index ) ;
- }
- }
- return set.size() ;
- }
-
- private:
- Visitor v ;
- Set set ;
- } ;
+template <typename Visitor>
+class Count_Distinct : public Processor<INTSXP, Count_Distinct<Visitor> > {
+public:
+ typedef VisitorHash<Visitor> Hash;
+ typedef VisitorEqualPredicate<Visitor> Pred;
+ typedef dplyr_hash_set<int, Hash, Pred > Set;
+
+ Count_Distinct(Visitor v_):
+ v(v_), set(0, Hash(v), Pred(v))
+ {}
+
+ inline int process_chunk(const SlicingIndex& indices) {
+ set.clear();
+ set.rehash(indices.size());
+ int n = indices.size();
+ for (int i = 0; i < n; i++) {
+ set.insert(indices[i]);
+ }
+ return set.size();
+ }
+
+private:
+ Visitor v;
+ Set set;
+};
+
+template <typename Visitor>
+class Count_Distinct_Narm : public Processor<INTSXP, Count_Distinct_Narm<Visitor> > {
+public:
+ typedef VisitorHash<Visitor> Hash;
+ typedef VisitorEqualPredicate<Visitor> Pred;
+ typedef dplyr_hash_set<int, Hash, Pred > Set;
+
+ Count_Distinct_Narm(Visitor v_):
+ v(v_), set(0, Hash(v), Pred(v))
+ {}
+
+ inline int process_chunk(const SlicingIndex& indices) {
+ set.clear();
+ set.rehash(indices.size());
+ int n = indices.size();
+ for (int i = 0; i < n; i++) {
+ int index = indices[i];
+ if (! v.is_na(index)) {
+ set.insert(index);
+ }
+ }
+ return set.size();
+ }
+
+private:
+ Visitor v;
+ Set set;
+};
}
diff --git a/inst/include/dplyr/Result/CumMax.h b/inst/include/dplyr/Result/CumMax.h
index 373ad52..7cfe4ed 100644
--- a/inst/include/dplyr/Result/CumMax.h
+++ b/inst/include/dplyr/Result/CumMax.h
@@ -1,42 +1,44 @@
#ifndef dplyr_Result_CumMax_H
#define dplyr_Result_CumMax_H
+#include <dplyr/Result/Mutater.h>
+
namespace dplyr {
- // version for REALSXP
- template <int RTYPE>
- class CumMax : public Mutater<RTYPE, CumMax<RTYPE> > {
- public:
- typedef typename Rcpp::traits::storage_type<RTYPE>::type STORAGE ;
-
- CumMax(SEXP data_) : data(data_){}
-
- void process_slice( Vector<RTYPE>& out, const SlicingIndex& index, const SlicingIndex& out_index){
- int n = index.size() ;
- STORAGE value = data[index[0]] ;
- out[out_index[0]] = value ;
- if( NumericVector::is_na(value) ){
- for( int i=1; i<n; i++){
- out[out_index[i]] = value ;
- }
- return ;
- }
-
- for( int i=1; i<n; i++){
- STORAGE current = data[index[i]] ;
- if( Rcpp::traits::is_na<RTYPE>(current) ){
- for(int j=i; j<n; j++){
- out[out_index[j]] = current ;
- }
- return ;
- }
- if( current > value ) value = current ;
- out[out_index[i]] = value ;
- }
+// version for REALSXP
+template <int RTYPE>
+class CumMax : public Mutater<RTYPE, CumMax<RTYPE> > {
+public:
+ typedef typename Rcpp::traits::storage_type<RTYPE>::type STORAGE;
+
+ CumMax(SEXP data_) : data(data_) {}
+
+ void process_slice(Vector<RTYPE>& out, const SlicingIndex& index, const SlicingIndex& out_index) {
+ int n = index.size();
+ STORAGE value = data[index[0]];
+ out[out_index[0]] = value;
+ if (NumericVector::is_na(value)) {
+ for (int i = 1; i < n; i++) {
+ out[out_index[i]] = value;
+ }
+ return;
+ }
+
+ for (int i = 1; i < n; i++) {
+ STORAGE current = data[index[i]];
+ if (Rcpp::traits::is_na<RTYPE>(current)) {
+ for (int j = i; j < n; j++) {
+ out[out_index[j]] = current;
}
-
- Vector<RTYPE> data ;
- } ;
+ return;
+ }
+ if (current > value) value = current;
+ out[out_index[i]] = value;
+ }
+ }
+
+ Vector<RTYPE> data;
+};
}
diff --git a/inst/include/dplyr/Result/CumMin.h b/inst/include/dplyr/Result/CumMin.h
index a312cf8..1ecfc2c 100644
--- a/inst/include/dplyr/Result/CumMin.h
+++ b/inst/include/dplyr/Result/CumMin.h
@@ -1,42 +1,44 @@
#ifndef dplyr_Result_CumMin_H
#define dplyr_Result_CumMin_H
+#include <dplyr/Result/Mutater.h>
+
namespace dplyr {
- // version for REALSXP
- template <int RTYPE>
- class CumMin : public Mutater<RTYPE, CumMin<RTYPE> > {
- public:
- typedef typename Rcpp::traits::storage_type<RTYPE>::type STORAGE ;
-
- CumMin(SEXP data_) : data(data_){}
-
- void process_slice( Vector<RTYPE>& out, const SlicingIndex& index, const SlicingIndex& out_index){
- int n = index.size() ;
- STORAGE value = data[index[0]] ;
- out[out_index[0]] = value ;
- if( NumericVector::is_na(value) ){
- for( int i=1; i<n; i++){
- out[out_index[i]] = value ;
- }
- return ;
- }
-
- for( int i=1; i<n; i++){
- STORAGE current = data[index[i]] ;
- if( Rcpp::traits::is_na<RTYPE>(current) ){
- for(int j=i; j<n; j++){
- out[out_index[j]] = current ;
- }
- return ;
- }
- if( current < value ) value = current ;
- out[out_index[i]] = value ;
- }
+// version for REALSXP
+template <int RTYPE>
+class CumMin : public Mutater<RTYPE, CumMin<RTYPE> > {
+public:
+ typedef typename Rcpp::traits::storage_type<RTYPE>::type STORAGE;
+
+ CumMin(SEXP data_) : data(data_) {}
+
+ void process_slice(Vector<RTYPE>& out, const SlicingIndex& index, const SlicingIndex& out_index) {
+ int n = index.size();
+ STORAGE value = data[index[0]];
+ out[out_index[0]] = value;
+ if (NumericVector::is_na(value)) {
+ for (int i = 1; i < n; i++) {
+ out[out_index[i]] = value;
+ }
+ return;
+ }
+
+ for (int i = 1; i < n; i++) {
+ STORAGE current = data[index[i]];
+ if (Rcpp::traits::is_na<RTYPE>(current)) {
+ for (int j = i; j < n; j++) {
+ out[out_index[j]] = current;
}
-
- Vector<RTYPE> data ;
- } ;
+ return;
+ }
+ if (current < value) value = current;
+ out[out_index[i]] = value;
+ }
+ }
+
+ Vector<RTYPE> data;
+};
}
diff --git a/inst/include/dplyr/Result/CumSum.h b/inst/include/dplyr/Result/CumSum.h
index f71dfa1..e1a3c81 100644
--- a/inst/include/dplyr/Result/CumSum.h
+++ b/inst/include/dplyr/Result/CumSum.h
@@ -1,50 +1,52 @@
#ifndef dplyr_Result_CumSum_H
#define dplyr_Result_CumSum_H
-namespace dplyr {
+#include <dplyr/Result/Mutater.h>
- // REALSXP version
- template <int RTYPE>
- class CumSum : public Mutater<RTYPE, CumSum<RTYPE> > {
- public:
- CumSum(SEXP data_) : data(data_){}
-
- void process_slice( Vector<RTYPE>& out, const SlicingIndex& index, const SlicingIndex& out_index){
- double value = 0.0 ;
- int n = index.size() ;
- for( int i=0; i<n; i++){
- value += data[index[i]] ;
- out[out_index[i]] = value ;
- }
- }
+namespace dplyr {
- Vector<RTYPE> data ;
- } ;
-
- // INTSXP version
- template <>
- class CumSum<INTSXP> : public Mutater<INTSXP, CumSum<INTSXP> > {
- public:
- CumSum(SEXP data_) : data(data_){}
-
- void process_slice( IntegerVector& out, const SlicingIndex& index, const SlicingIndex& out_index){
- int value = 0 ;
- int n = index.size() ;
- for( int i=0; i<n; i++){
- int current = data[index[i]] ;
- if( IntegerVector::is_na(current) ){
- for( int j=i; j<n; j++){
- out[ out_index[j] ] = NA_INTEGER ;
- }
- return ;
- }
- value += current ;
- out[out_index[i]] = value ;
- }
+// REALSXP version
+template <int RTYPE>
+class CumSum : public Mutater<RTYPE, CumSum<RTYPE> > {
+public:
+ CumSum(SEXP data_) : data(data_) {}
+
+ void process_slice(Vector<RTYPE>& out, const SlicingIndex& index, const SlicingIndex& out_index) {
+ double value = 0.0;
+ int n = index.size();
+ for (int i = 0; i < n; i++) {
+ value += data[index[i]];
+ out[out_index[i]] = value;
+ }
+ }
+
+ Vector<RTYPE> data;
+};
+
+// INTSXP version
+template <>
+class CumSum<INTSXP> : public Mutater<INTSXP, CumSum<INTSXP> > {
+public:
+ CumSum(SEXP data_) : data(data_) {}
+
+ void process_slice(IntegerVector& out, const SlicingIndex& index, const SlicingIndex& out_index) {
+ int value = 0;
+ int n = index.size();
+ for (int i = 0; i < n; i++) {
+ int current = data[index[i]];
+ if (IntegerVector::is_na(current)) {
+ for (int j = i; j < n; j++) {
+ out[ out_index[j] ] = NA_INTEGER;
}
-
- IntegerVector data ;
- } ;
+ return;
+ }
+ value += current;
+ out[out_index[i]] = value;
+ }
+ }
+
+ IntegerVector data;
+};
}
diff --git a/inst/include/dplyr/Result/DelayedProcessor.h b/inst/include/dplyr/Result/DelayedProcessor.h
index 5d80148..224b3cb 100644
--- a/inst/include/dplyr/Result/DelayedProcessor.h
+++ b/inst/include/dplyr/Result/DelayedProcessor.h
@@ -1,268 +1,287 @@
#ifndef dplyr_Result_DelayedProcessor_H
#define dplyr_Result_DelayedProcessor_H
-namespace dplyr{
-
- template <typename CLASS>
- class DelayedProcessor_Base {
- public:
- DelayedProcessor_Base(){}
- virtual ~DelayedProcessor_Base(){}
-
- virtual bool handled(int i, const RObject& chunk ) = 0 ;
- virtual bool can_promote(const RObject& chunk ) = 0 ;
- virtual DelayedProcessor_Base* promote(int i, const RObject& chunk) = 0 ;
- virtual SEXP get() = 0;
- } ;
-
- template <int RTYPE>
- bool valid_conversion(int rtype){
- return rtype == RTYPE ;
- }
+#include <tools/hash.h>
+#include <tools/ShrinkableVector.h>
+#include <tools/scalar_type.h>
+#include <tools/utils.h>
+#include <dplyr/vector_class.h>
+#include <dplyr/checks.h>
+
+namespace dplyr {
+
+class IDelayedProcessor {
+public:
+ IDelayedProcessor() {}
+ virtual ~IDelayedProcessor() {}
+
+ virtual bool try_handle(const RObject& chunk) = 0;
+ virtual IDelayedProcessor* promote(const RObject& chunk) = 0;
+ virtual SEXP get() = 0;
+ virtual std::string describe() = 0;
+};
+
+template <int RTYPE>
+bool valid_conversion(int rtype) {
+ return rtype == RTYPE;
+}
- template <>
- inline bool valid_conversion<REALSXP>( int rtype ){
- switch( rtype ){
- case REALSXP:
- case INTSXP:
- case LGLSXP:
- return true ;
- default: break ;
- }
- return false ;
- }
+template <>
+inline bool valid_conversion<REALSXP>(int rtype) {
+ switch (rtype) {
+ case REALSXP:
+ case INTSXP:
+ case LGLSXP:
+ return true;
+ default:
+ break;
+ }
+ return false;
+}
- template <>
- inline bool valid_conversion<INTSXP>( int rtype ){
- switch( rtype ){
- case INTSXP:
- case LGLSXP:
- return true ;
- default: break ;
- }
- return false ;
- }
+template <>
+inline bool valid_conversion<INTSXP>(int rtype) {
+ switch (rtype) {
+ case INTSXP:
+ case LGLSXP:
+ return true;
+ default:
+ break;
+ }
+ return false;
+}
- template <int RTYPE>
- inline bool valid_promotion(int rtype) {
- return false ;
- }
+template <int RTYPE>
+inline bool valid_promotion(int) {
+ return false;
+}
- template <>
- inline bool valid_promotion<INTSXP>( int rtype ){
- return rtype == REALSXP ;
+template <>
+inline bool valid_promotion<INTSXP>(int rtype) {
+ return rtype == REALSXP;
+}
+
+template <>
+inline bool valid_promotion<LGLSXP>(int rtype) {
+ return rtype == REALSXP || rtype == INTSXP;
+}
+
+template <int RTYPE, typename CLASS>
+class DelayedProcessor : public IDelayedProcessor {
+public:
+ typedef typename traits::scalar_type<RTYPE>::type STORAGE;
+ typedef Vector<RTYPE> Vec;
+
+ DelayedProcessor(const RObject& first_result, int ngroups_, const SymbolString& name_) :
+ res(no_init(ngroups_)), pos(0), seen_na_only(true), name(name_)
+ {
+ if (!try_handle(first_result))
+ stop("cannot handle result of type %i for column '%s'", first_result.sexp_type(), name.get_utf8_cstring());
+ copy_most_attributes(res, first_result);
+ }
+
+ DelayedProcessor(int pos_, const RObject& chunk, SEXP res_, const SymbolString& name_) :
+ res(as<Vec>(res_)), pos(pos_), seen_na_only(false), name(name_)
+ {
+ copy_most_attributes(res, chunk);
+ if (!try_handle(chunk)) {
+ stop("cannot handle result of type %i in promotion for column '%s'",
+ chunk.sexp_type(), name.get_utf8_cstring()
+ );
+ }
+ }
+
+ virtual bool try_handle(const RObject& chunk) {
+ check_supported_type(chunk, name);
+ check_length(Rf_length(chunk), 1, "a summary value", name);
+
+ int rtype = TYPEOF(chunk);
+ if (valid_conversion<RTYPE>(rtype)) {
+ // copy, and memoize the copied value
+ const typename Vec::stored_type& converted_chunk = (res[pos++] = as<STORAGE>(chunk));
+ if (!Vec::is_na(converted_chunk))
+ seen_na_only = false;
+ return true;
+ } else {
+ return false;
}
+ }
- template <>
- inline bool valid_promotion<LGLSXP>( int rtype ){
- return rtype == REALSXP || rtype == INTSXP ;
+ virtual IDelayedProcessor* promote(const RObject& chunk) {
+ if (!can_promote(chunk)) {
+ LOG_VERBOSE << "can't promote";
+ return 0;
}
- template <int RTYPE, typename CLASS>
- class DelayedProcessor : public DelayedProcessor_Base<CLASS> {
- public:
- typedef typename Rcpp::traits::storage_type<RTYPE>::type STORAGE ;
- typedef Vector<RTYPE> Vec ;
-
- DelayedProcessor( int first_non_na, SEXP first_result, int ngroups_) :
- res( no_init(ngroups_) )
- {
- std::fill( res.begin(), res.begin() + first_non_na, Vec::get_na() );
- res[first_non_na] = as<STORAGE>( first_result ) ;
- copy_most_attributes(res, first_result ) ;
- }
-
- DelayedProcessor( int i, const RObject& chunk, SEXP res_ ) :
- res( as<Vec>( res_ ) )
- {
- copy_most_attributes( res, chunk ) ;
- res[i] = as<STORAGE>(chunk) ;
- }
-
- virtual bool handled(int i, const RObject& chunk ) {
- int rtype = TYPEOF(chunk) ;
- if( valid_conversion<RTYPE>(rtype) ){
- res[i] = as<STORAGE>( chunk ) ;
- return true ;
- } else {
- return false ;
- }
- }
-
- virtual bool can_promote(const RObject& chunk ) {
- return valid_promotion<RTYPE>( TYPEOF(chunk) ) ;
- }
- virtual DelayedProcessor_Base<CLASS>* promote(int i, const RObject& chunk){
- int rtype = TYPEOF(chunk) ;
- switch( rtype ){
- case LGLSXP: return new DelayedProcessor<LGLSXP , CLASS>(i, chunk, res ) ;
- case INTSXP: return new DelayedProcessor<INTSXP , CLASS>(i, chunk, res ) ;
- case REALSXP: return new DelayedProcessor<REALSXP, CLASS>(i, chunk, res ) ;
- case CPLXSXP: return new DelayedProcessor<CPLXSXP, CLASS>(i, chunk, res ) ;
- default: break ;
- }
- return 0 ;
- }
-
- virtual SEXP get() {
- return res ;
- }
-
-
- private:
- Vec res ;
-
-
- } ;
-
- template <typename CLASS>
- class DelayedProcessor<STRSXP, CLASS> : public DelayedProcessor_Base<CLASS> {
- public:
- DelayedProcessor(int first_non_na_, SEXP first_result, int ngroups) :
- res(ngroups)
- {
- res[first_non_na_] = as<String>(first_result) ;
- copy_most_attributes(res, first_result) ;
- }
-
- virtual bool handled(int i, const RObject& chunk ) {
- res[i] = as<String>(chunk) ;
- return true ;
- }
- virtual bool can_promote(const RObject& chunk ) {
- return false ;
- }
- virtual DelayedProcessor_Base<CLASS>* promote(int i, const RObject& chunk) {
- return 0 ;
- }
- virtual SEXP get() {
- return res ;
- }
-
- private:
- CharacterVector res ;
- } ;
-
- template <typename CLASS>
- class FactorDelayedProcessor : public DelayedProcessor_Base<CLASS>{
- private:
- typedef dplyr_hash_map<SEXP,int> LevelsMap ;
-
- public:
-
- FactorDelayedProcessor(int first_non_na, SEXP first_result, int ngroups ) :
- res(ngroups, NA_INTEGER)
- {
- copy_most_attributes( res, first_result ) ;
- CharacterVector levels = Rf_getAttrib( first_result, Rf_install("levels") ) ;
- int n = levels.size() ;
- for( int i=0; i<n; i++) levels_map[ levels[i] ] = i+1 ;
- }
-
- virtual bool handled(int i, const RObject& chunk ) {
- CharacterVector lev = chunk.attr("levels") ;
- update_levels(lev) ;
-
- int val = as<int>(chunk) ;
- if( val == NA_INTEGER){
- return true ;
- }
- SEXP s = lev[val-1] ;
- res[i] = levels_map[s] ;
- return true ;
- }
- virtual bool can_promote(const RObject& chunk ) {
- return false ;
- }
- virtual DelayedProcessor_Base<CLASS>* promote(int i, const RObject& chunk) {
- return 0 ;
- }
- virtual SEXP get() {
- int n = levels_map.size() ;
- CharacterVector levels(n) ;
- LevelsMap::iterator it = levels_map.begin() ;
- for(int i=0; i<n; i++, ++it){
- levels[it->second-1] = it->first ;
- }
- res.attr("class") = "factor" ;
- res.attr("levels") = levels ;
- return res ;
- }
-
- private:
-
- void update_levels( const CharacterVector& lev) {
- int nlevels = levels_map.size() ;
- int n = lev.size() ;
- for(int i=0; i<n; i++) {
- SEXP s = lev[i] ;
- if( ! levels_map.count(s) ) {
- levels_map.insert( std::make_pair(s, ++nlevels) ) ;
- }
- }
- }
-
- IntegerVector res ;
- LevelsMap levels_map ;
- } ;
-
-
-
- template <typename CLASS>
- class DelayedProcessor<VECSXP, CLASS> : public DelayedProcessor_Base<CLASS> {
- public:
- DelayedProcessor(int first_non_na_, SEXP first_result, int ngroups) :
- res(ngroups)
- {
- res[first_non_na_] = maybe_copy(VECTOR_ELT(first_result, 0)) ;
- copy_most_attributes(res, first_result) ;
- }
-
- virtual bool handled(int i, const RObject& chunk ) {
- if( is<List>(chunk) && Rf_length(chunk) == 1){
- res[i] = maybe_copy(VECTOR_ELT(chunk, 0)) ;
- return true ;
- }
- return false ;
- }
- virtual bool can_promote(const RObject& chunk ) {
- return false ;
- }
- virtual DelayedProcessor_Base<CLASS>* promote(int i, const RObject& chunk) {
- return 0 ;
- }
- virtual SEXP get() {
- return res ;
- }
-
- private:
- List res ;
-
- inline SEXP maybe_copy(SEXP x) const {
- return is_ShrinkableVector(x) ? Rf_duplicate(x) : x ;
- }
- } ;
-
- template <typename CLASS>
- DelayedProcessor_Base<CLASS>* get_delayed_processor(int i, SEXP first_result, int ngroups){
- if( Rf_inherits(first_result, "factor") ){
- return new FactorDelayedProcessor<CLASS>(i, first_result, ngroups) ;
- } else if( Rcpp::is<int>( first_result ) ){
- return new DelayedProcessor<INTSXP, CLASS>(i, first_result, ngroups) ;
- } else if( Rcpp::is<double>( first_result) ){
- return new DelayedProcessor<REALSXP, CLASS>(i, first_result, ngroups) ;
- } else if( Rcpp::is<Rcpp::String>( first_result) ){
- return new DelayedProcessor<STRSXP, CLASS>(i, first_result, ngroups) ;
- } else if( Rcpp::is<bool>( first_result) ){
- return new DelayedProcessor<LGLSXP, CLASS>(i, first_result, ngroups) ;
- } else if( Rcpp::is<Rcpp::List>( first_result ) ){
- if( Rf_length(first_result) != 1 ) return 0 ;
- return new DelayedProcessor<VECSXP, CLASS>(i, first_result, ngroups) ;
- } else if( Rf_length(first_result) == 1 && TYPEOF(first_result) == CPLXSXP ){
- return new DelayedProcessor<CPLXSXP, CLASS>(i, first_result, ngroups) ;
- }
- return 0 ;
+ int rtype = TYPEOF(chunk);
+ switch (rtype) {
+ case LGLSXP:
+ return new DelayedProcessor<LGLSXP, CLASS>(pos, chunk, res, name);
+ case INTSXP:
+ return new DelayedProcessor<INTSXP, CLASS>(pos, chunk, res, name);
+ case REALSXP:
+ return new DelayedProcessor<REALSXP, CLASS>(pos, chunk, res, name);
+ case CPLXSXP:
+ return new DelayedProcessor<CPLXSXP, CLASS>(pos, chunk, res, name);
+ case STRSXP:
+ return new DelayedProcessor<STRSXP, CLASS>(pos, chunk, res, name);
+ default:
+ break;
}
+ return 0;
+ }
+
+ virtual SEXP get() {
+ return res;
+ }
+
+ virtual std::string describe() {
+ return vector_class<RTYPE>();
+ }
+
+
+private:
+ bool can_promote(const RObject& chunk) {
+ return seen_na_only || valid_promotion<RTYPE>(TYPEOF(chunk));
+ }
+
+
+private:
+ Vec res;
+ int pos;
+ bool seen_na_only;
+ const SymbolString name;
+
+};
+
+template <typename CLASS>
+class FactorDelayedProcessor : public IDelayedProcessor {
+private:
+ typedef dplyr_hash_map<SEXP, int> LevelsMap;
+
+public:
+
+ FactorDelayedProcessor(SEXP first_result, int ngroups, const SymbolString& name_) :
+ res(no_init(ngroups)), pos(0), name(name_)
+ {
+ copy_most_attributes(res, first_result);
+ CharacterVector levels = get_levels(first_result);
+ int n = levels.size();
+ for (int i = 0; i < n; i++) levels_map[ levels[i] ] = i + 1;
+ if (!try_handle(first_result))
+ stop("cannot handle factor result for column '%s'", name.get_utf8_cstring());
+ }
+
+ virtual bool try_handle(const RObject& chunk) {
+ CharacterVector lev = get_levels(chunk);
+ update_levels(lev);
+
+ int val = as<int>(chunk);
+ if (val != NA_INTEGER) val = levels_map[lev[val - 1]];
+ res[pos++] = val;
+ return true;
+ }
+
+ virtual IDelayedProcessor* promote(const RObject&) {
+ return 0;
+ }
+
+ virtual SEXP get() {
+ int n = levels_map.size();
+ CharacterVector levels(n);
+ LevelsMap::iterator it = levels_map.begin();
+ for (int i = 0; i < n; i++, ++it) {
+ levels[it->second - 1] = it->first;
+ }
+ set_levels(res, levels);
+ return res;
+ }
+
+ virtual std::string describe() {
+ return "factor";
+ }
+
+private:
+
+ void update_levels(const CharacterVector& lev) {
+ int nlevels = levels_map.size();
+ int n = lev.size();
+ for (int i = 0; i < n; i++) {
+ SEXP s = lev[i];
+ if (! levels_map.count(s)) {
+ levels_map.insert(std::make_pair(s, ++nlevels));
+ }
+ }
+ }
+
+ IntegerVector res;
+ int pos;
+ LevelsMap levels_map;
+ const SymbolString name;
+};
+
+
+
+template <typename CLASS>
+class DelayedProcessor<VECSXP, CLASS> : public IDelayedProcessor {
+public:
+ DelayedProcessor(SEXP first_result, int ngroups, const SymbolString& name_) :
+ res(ngroups), pos(0), name(name_)
+ {
+ copy_most_attributes(res, first_result);
+ if (!try_handle(first_result))
+ stop("cannot handle list result for column '%s'", name.get_utf8_cstring());
+ }
+
+ virtual bool try_handle(const RObject& chunk) {
+ if (is<List>(chunk) && Rf_length(chunk) == 1) {
+ res[pos++] = Rf_duplicate(VECTOR_ELT(chunk, 0));
+ return true;
+ }
+ return false;
+ }
+
+ virtual IDelayedProcessor* promote(const RObject&) {
+ return 0;
+ }
+
+ virtual SEXP get() {
+ return res;
+ }
+
+ virtual std::string describe() {
+ return "list";
+ }
+
+private:
+ List res;
+ int pos;
+ const SymbolString name;
+};
+
+template <typename CLASS>
+IDelayedProcessor* get_delayed_processor(SEXP first_result, int ngroups, const SymbolString& name) {
+ check_supported_type(first_result, name);
+ check_length(Rf_length(first_result), 1, "a summary value", name);
+
+ if (Rf_inherits(first_result, "factor")) {
+ return new FactorDelayedProcessor<CLASS>(first_result, ngroups, name);
+ } else if (Rcpp::is<int>(first_result)) {
+ return new DelayedProcessor<INTSXP, CLASS>(first_result, ngroups, name);
+ } else if (Rcpp::is<double>(first_result)) {
+ return new DelayedProcessor<REALSXP, CLASS>(first_result, ngroups, name);
+ } else if (Rcpp::is<Rcpp::String>(first_result)) {
+ return new DelayedProcessor<STRSXP, CLASS>(first_result, ngroups, name);
+ } else if (Rcpp::is<bool>(first_result)) {
+ return new DelayedProcessor<LGLSXP, CLASS>(first_result, ngroups, name);
+ } else if (Rcpp::is<Rcpp::List>(first_result)) {
+ return new DelayedProcessor<VECSXP, CLASS>(first_result, ngroups, name);
+ } else if (TYPEOF(first_result) == CPLXSXP) {
+ return new DelayedProcessor<CPLXSXP, CLASS>(first_result, ngroups, name);
+ }
+
+ stop("unknown result of type %d for column '%s'", TYPEOF(first_result), name.get_utf8_cstring());
+}
}
#endif
diff --git a/inst/include/dplyr/Result/DelayedReducer.h b/inst/include/dplyr/Result/DelayedReducer.h
deleted file mode 100644
index 8177bea..0000000
--- a/inst/include/dplyr/Result/DelayedReducer.h
+++ /dev/null
@@ -1,28 +0,0 @@
-#ifndef dplyr_DelayedReducer_H
-#define dplyr_DelayedReducer_H
-
-namespace dplyr {
-
- template <int INPUT_RTYPE>
- class DelayedReducer : public CallbackProcessor< DelayedReducer<INPUT_RTYPE> > {
- public:
- DelayedReducer(Rcpp::Function fun_, Rcpp::String variable_, SEXP data_ ):
- call(fun_), proxy(call, 1), data(data_) {}
-
- virtual ~DelayedReducer(){} ;
-
- inline SEXP process_chunk( const SlicingIndex& indices){
- proxy = wrap_subset<INPUT_RTYPE>( data, indices );
- return call.eval() ;
- }
-
- private:
-
- Rcpp::Language call ;
- Rcpp::Language::Proxy proxy ;
- SEXP data ;
- } ;
-
-} // namespace dplyr
-
-#endif
diff --git a/inst/include/dplyr/Result/Everything.h b/inst/include/dplyr/Result/Everything.h
deleted file mode 100644
index 3560f35..0000000
--- a/inst/include/dplyr/Result/Everything.h
+++ /dev/null
@@ -1,7 +0,0 @@
-#ifndef dplyr_Result_Everything_H
-#define dplyr_Result_Everything_H
-
-namespace dplyr{
- struct Everything{} ;
-}
-#endif
diff --git a/inst/include/dplyr/Result/GroupedCallProxy.h b/inst/include/dplyr/Result/GroupedCallProxy.h
index e92a355..df4b960 100644
--- a/inst/include/dplyr/Result/GroupedCallProxy.h
+++ b/inst/include/dplyr/Result/GroupedCallProxy.h
@@ -1,203 +1,103 @@
#ifndef dplyr_GroupedCallProxy_H
#define dplyr_GroupedCallProxy_H
+#include <dplyr/Hybrid.h>
+
+#include <dplyr/Result/CallElementProxy.h>
+#include <dplyr/Result/LazyGroupedSubsets.h>
+#include <dplyr/Result/ILazySubsets.h>
+#include <dplyr/Result/GroupedHybridCall.h>
+
namespace dplyr {
- template <typename Data = GroupedDataFrame, typename Subsets = LazyGroupedSubsets>
- class GroupedCallProxy {
- public:
- typedef GroupedHybridCall<Subsets> HybridCall ;
-
- GroupedCallProxy( Call call_, const Subsets& subsets_, const Environment& env_) :
- call(call_), subsets(subsets_), proxies(), env(env_)
- {
- set_call(call) ;
- }
-
- GroupedCallProxy( Call call_, const Data& data_, const Environment& env_) :
- call(call_), subsets(data_), proxies(), env(env_)
- {
- set_call(call) ;
- }
-
- GroupedCallProxy( const Data& data_, const Environment& env_ ) :
- subsets(data_), proxies(), env(env_)
- {}
-
- GroupedCallProxy( const Data& data_) :
- subsets(data_), proxies()
- {}
-
- ~GroupedCallProxy(){}
-
- template <typename Container>
- SEXP get(const Container& indices){
- subsets.clear();
-
- if( TYPEOF(call) == LANGSXP){
- if( can_simplify(call) ) {
- HybridCall hybrid_eval( call, indices, subsets, env ) ;
- return hybrid_eval.eval() ;
- }
- int n = proxies.size() ;
- for( int i=0; i<n; i++){
- proxies[i].set( subsets.get(proxies[i].symbol, indices ) ) ;
- }
-
- return call.eval(env) ;
- } else if( TYPEOF(call) == SYMSXP ) {
- if(subsets.count(call)){
- return subsets.get(call, indices) ;
- }
- return env.find( CHAR(PRINTNAME(call)) ) ;
- } else {
- // all other types that evaluate to themselves
- return call ;
- }
- }
-
- void set_call( SEXP call_ ){
- proxies.clear() ;
- call = call_ ;
- if( TYPEOF(call) == LANGSXP ) traverse_call(call) ;
- }
-
- void input( Rcpp::String name, SEXP x ){
- subsets.input( Rf_installChar(name.get_sexp()) , x ) ;
- }
-
- inline int nsubsets(){
- return subsets.size() ;
- }
-
- inline SEXP get_variable( Rcpp::String name ) const {
- return subsets.get_variable(Rf_installChar(name.get_sexp()) ) ;
- }
-
- inline bool is_constant() const {
- return TYPEOF(call) != LANGSXP && Rf_length(call) == 1 ;
- }
-
- inline SEXP get_call() const {
- return call ;
- }
-
- inline bool has_variable(SEXP symbol) const {
- return subsets.count(symbol);
- }
-
- inline void set_env(SEXP env_){
- env = env_ ;
- }
-
- private:
-
- void traverse_call( SEXP obj ){
- if( TYPEOF(obj) == LANGSXP && CAR(obj) == Rf_install("local") ) return ;
-
- if( TYPEOF(obj) == LANGSXP && CAR(obj) == Rf_install("global") ){
- SEXP symb = CADR(obj) ;
- if( TYPEOF(symb) != SYMSXP ) stop( "global only handles symbols" ) ;
- SEXP res = env.find(CHAR(PRINTNAME(symb))) ;
- call = res ;
- return ;
- }
-
- if( TYPEOF(obj) == LANGSXP && CAR(obj) == Rf_install("column") ){
- call = get_column(CADR(obj), env, subsets) ;
- return ;
- }
-
- if( ! Rf_isNull(obj) ){
- SEXP head = CAR(obj) ;
-
- switch( TYPEOF( head ) ){
- case LANGSXP:
- if( CAR(head) == Rf_install("global") ){
- SEXP symb = CADR(head) ;
- if( TYPEOF(symb) != SYMSXP ) stop( "global only handles symbols" ) ;
-
- SEXP res = env.find( CHAR(PRINTNAME(symb)) ) ;
-
- SETCAR(obj, res) ;
- SET_TYPEOF(obj, LISTSXP) ;
- break ;
- }
- if( CAR(head) == Rf_install("column")){
- Symbol column = get_column( CADR(head), env, subsets) ;
- SETCAR(obj, column ) ;
- head = CAR(obj) ;
- proxies.push_back( CallElementProxy( head, obj ) );
- break ;
- }
-
- if( CAR(head) == Rf_install("~") ) break ;
- if( CAR(head) == Rf_install("order_by") ) break ;
- if( CAR(head) == Rf_install("function") ) break ;
- if( CAR(head) == Rf_install("local") ) return ;
- if( CAR(head) == Rf_install("<-") ){
- stop( "assignments are forbidden" ) ;
- }
-
- if( Rf_length(head) == 3 ){
- SEXP symb = CAR(head) ;
- if( symb == R_DollarSymbol || symb == Rf_install("@") || symb == Rf_install("::") || symb == Rf_install(":::") ){
-
- // for things like : foo( bar = bling )$bla
- // so that `foo( bar = bling )` gets processed
- if( TYPEOF(CADR(head)) == LANGSXP ){
- traverse_call( CDR(head) ) ;
- }
-
- // deal with foo$bar( bla = boom )
- if( TYPEOF(CADDR(head)) == LANGSXP ){
- traverse_call( CDDR(head) ) ;
- }
-
- break ;
- }
- }
- traverse_call( CDR(head) ) ;
- break ;
- case LISTSXP:
- traverse_call( head ) ;
- traverse_call( CDR(head) ) ;
- break ;
-
- case SYMSXP:
- if( TYPEOF(obj) != LANGSXP ){
- if( ! subsets.count(head) ){
-
- // in the Environment -> resolve
- try{
- if( head == R_MissingArg ) break ;
- if( head == Rf_install(".") ) break ;
-
- Shield<SEXP> x( env.find( CHAR(PRINTNAME(head)) ) ) ;
- SETCAR( obj, x );
- } catch(...){
- // when the binding is not found in the environment
- // e.g. summary(mod)$r.squared
- // the "r.squared" is not in the env
- }
- } else {
- // in the data frame
- proxies.push_back( CallElementProxy( head, obj ) );
- }
- }
- break ;
- }
-
- traverse_call( CDR(obj) ) ;
- }
- }
-
- Rcpp::Call call ;
- Subsets subsets ;
- std::vector<CallElementProxy> proxies ;
- Environment env;
-
- } ;
+template <typename Data = GroupedDataFrame, typename Subsets = LazyGroupedSubsets>
+class GroupedCallProxy {
+public:
+ GroupedCallProxy(const Rcpp::Call& call_, const Subsets& subsets_, const Environment& env_) :
+ subsets(subsets_), proxies()
+ {
+ set_call(call_);
+ set_env(env_);
+ }
+
+ GroupedCallProxy(const Rcpp::Call& call_, const Data& data_, const Environment& env_) :
+ subsets(data_), proxies()
+ {
+ set_call(call_);
+ set_env(env_);
+ }
+
+ GroupedCallProxy(const Data& data_, const Environment& env_) :
+ subsets(data_), proxies()
+ {
+ set_env(env_);
+ }
+
+ GroupedCallProxy(const Data& data_) :
+ subsets(data_), proxies()
+ {}
+
+ ~GroupedCallProxy() {}
+
+public:
+ SEXP eval() {
+ return get(NaturalSlicingIndex(subsets.nrows()));
+ }
+
+ SEXP get(const SlicingIndex& indices) {
+ subsets.clear();
+
+ return get_hybrid_eval()->eval(indices);
+ }
+
+ GroupedHybridEval* get_hybrid_eval() {
+ if (!hybrid_eval) {
+ hybrid_eval.reset(new GroupedHybridEval(call, subsets, env));
+ }
+
+ return hybrid_eval.get();
+ }
+
+ void set_call(SEXP call_) {
+ proxies.clear();
+ hybrid_eval.reset();
+ call = call_;
+ }
+
+ inline void set_env(SEXP env_) {
+ env = env_;
+ hybrid_eval.reset();
+ }
+
+ void input(const SymbolString& name, SEXP x) {
+ subsets.input(name, x);
+ hybrid_eval.reset();
+ }
+
+ inline int nsubsets() const {
+ return subsets.size();
+ }
+
+ inline bool has_variable(const SymbolString& name) const {
+ return subsets.has_variable(name);
+ }
+
+ inline SEXP get_variable(const SymbolString& name) const {
+ return subsets.get_variable(name);
+ }
+
+ inline bool is_constant() const {
+ return TYPEOF(call) != LANGSXP && Rf_length(call) == 1;
+ }
+
+private:
+ Rcpp::Call call;
+ Subsets subsets;
+ std::vector<CallElementProxy> proxies;
+ Environment env;
+ boost::scoped_ptr<GroupedHybridEval> hybrid_eval;
+
+};
}
diff --git a/inst/include/dplyr/Result/GroupedCallReducer.h b/inst/include/dplyr/Result/GroupedCallReducer.h
index b01d4d3..f3d3f2f 100644
--- a/inst/include/dplyr/Result/GroupedCallReducer.h
+++ b/inst/include/dplyr/Result/GroupedCallReducer.h
@@ -1,25 +1,36 @@
#ifndef dplyr_GroupedCallReducer_H
#define dplyr_GroupedCallReducer_H
+#include <tools/Call.h>
+
+#include <dplyr/Result/CallbackProcessor.h>
+#include <dplyr/Result/GroupedCallProxy.h>
+
namespace dplyr {
- template <typename Data, typename Subsets>
- class GroupedCallReducer : public CallbackProcessor< GroupedCallReducer<Data,Subsets> > {
- public:
- GroupedCallReducer(Rcpp::Call call, const Subsets& subsets, const Environment& env):
- proxy(call, subsets, env)
- {
- }
+template <typename Data, typename Subsets>
+class GroupedCallReducer : public CallbackProcessor< GroupedCallReducer<Data, Subsets> > {
+public:
+ GroupedCallReducer(Rcpp::Call call, const Subsets& subsets, const Environment& env, const SymbolString& name_) :
+ proxy(call, subsets, env),
+ name(name_)
+ {
+ }
+
+ virtual ~GroupedCallReducer() {};
- virtual ~GroupedCallReducer(){} ;
+ inline SEXP process_chunk(const SlicingIndex& indices) {
+ return proxy.get(indices);
+ }
- inline SEXP process_chunk( const SlicingIndex& indices ){
- return proxy.get(indices) ;
- }
+ const SymbolString& get_name() const {
+ return name;
+ }
- private:
- GroupedCallProxy<Data, Subsets> proxy ;
- } ;
+private:
+ GroupedCallProxy<Data, Subsets> proxy;
+ const SymbolString name;
+};
} // namespace dplyr
diff --git a/inst/include/dplyr/Result/GroupedHybridCall.h b/inst/include/dplyr/Result/GroupedHybridCall.h
index c531c29..f31ddbb 100644
--- a/inst/include/dplyr/Result/GroupedHybridCall.h
+++ b/inst/include/dplyr/Result/GroupedHybridCall.h
@@ -1,112 +1,241 @@
#ifndef dplyr_GroupedHybridCall_H
#define dplyr_GroupedHybridCall_H
-namespace dplyr {
+#include <boost/scoped_ptr.hpp>
+
+#include <tools/Call.h>
+
+#include <dplyr/Result/Result.h>
+
+#include <bindrcpp.h>
- template <typename Subsets>
- class GroupedHybridCall {
- public:
- GroupedHybridCall( const Call& call_, const SlicingIndex& indices_, Subsets& subsets_, const Environment& env_ ) :
- call( clone(call_) ), indices(indices_), subsets(subsets_), env(env_)
- {
- while( simplified() ){}
- }
-
- SEXP eval(){
- if( TYPEOF(call) == LANGSXP ){
- substitute(call) ;
- return Rcpp_eval( call, env ) ;
- } else if(TYPEOF(call) == SYMSXP) {
- if(subsets.count(call)){
- return subsets.get(call, indices) ;
- }
- return env.find( CHAR(PRINTNAME(call)) ) ;
- }
- return call ;
- }
-
- private:
-
- void substitute( SEXP obj){
- if( ! Rf_isNull(obj) ){
- SEXP head = CAR(obj) ;
- switch( TYPEOF( head ) ){
- case LISTSXP:
- substitute( CDR(head) ) ;
- break ;
-
- case LANGSXP:
- {
- SEXP symb = CAR(head) ;
- if( symb == R_DollarSymbol || symb == Rf_install("@") || symb == Rf_install("::") || symb == Rf_install(":::") ){
-
- if( TYPEOF(CADR(head)) == LANGSXP ){
- substitute( CDR(head) ) ;
- }
-
- // deal with foo$bar( bla = boom )
- if( TYPEOF(CADDR(head)) == LANGSXP ){
- substitute( CDDR(head) ) ;
- }
-
- break ;
- }
-
- substitute( CDR(head) ) ;
- break ;
- }
- case SYMSXP:
- if( TYPEOF(obj) != LANGSXP ){
- if( subsets.count(head) ){
- SETCAR(obj, subsets.get(head, indices) ) ;
- }
- }
- break ;
- }
- substitute( CDR(obj) ) ;
- }
- }
-
- bool simplified(){
- // initial
- if( TYPEOF(call) == LANGSXP ){
- boost::scoped_ptr<Result> res( get_handler(call, subsets, env) );
- if( res ){
- // replace the call by the result of process
- call = res->process(indices) ;
-
- // no need to go any further, we simplified the top level
- return true ;
- }
- return replace( CDR(call) ) ;
- }
- return false ;
- }
-
- bool replace( SEXP p ){
- SEXP obj = CAR(p) ;
- if( TYPEOF(obj) == LANGSXP ){
- boost::scoped_ptr<Result> res( get_handler(obj, subsets, env) );
- if(res){
- SETCAR(p, res->process(indices) ) ;
- return true ;
- }
-
- if( replace( CDR(obj) ) ) return true ;
- }
-
- if( TYPEOF(p) == LISTSXP ){
- return replace( CDR(p) ) ;
- }
-
- return false ;
- }
-
- Call call ;
- const SlicingIndex& indices ;
- Subsets& subsets ;
- const Environment& env ;
- } ;
+namespace dplyr {
+inline static
+SEXP rlang_object(const char* name) {
+ static Environment rlang = Rcpp::Environment::namespace_env("rlang");
+ return rlang[name];
}
+
+
+class IHybridCallback {
+protected:
+ virtual ~IHybridCallback() {}
+
+public:
+ virtual SEXP get_subset(const SymbolString& name) const = 0;
+};
+
+
+class GroupedHybridEnv {
+public:
+ GroupedHybridEnv(const CharacterVector& names_, const Environment& env_, const IHybridCallback* callback_) :
+ names(names_), env(env_), callback(callback_), has_overscope(false)
+ {
+ LOG_VERBOSE;
+ }
+
+ ~GroupedHybridEnv() {
+ if (has_overscope) {
+ static Function overscope_clean = rlang_object("overscope_clean");
+ overscope_clean(overscope);
+ }
+ }
+
+public:
+ const Environment& get_overscope() const {
+ provide_overscope();
+ return overscope;
+ }
+
+private:
+ void provide_overscope() const {
+ if (has_overscope)
+ return;
+
+ // Environment::new_child() performs an R callback, creating the environment
+ // in R should be slightly faster
+ Environment active_env =
+ create_env_string(
+ names, &GroupedHybridEnv::hybrid_get_callback,
+ PAYLOAD(const_cast<void*>(reinterpret_cast<const void*>(callback))), env);
+
+ // If bindr (via bindrcpp) supported the creation of a child environment, we could save the
+ // call to Rcpp_eval() triggered by active_env.new_child()
+ Environment bottom = active_env.new_child(true);
+ bottom[".data"] = rlang_new_data_source(active_env);
+
+ // Install definitions for formula self-evaluation and unguarding
+ Function new_overscope = rlang_object("new_overscope");
+ overscope = new_overscope(bottom, active_env, env);
+
+ has_overscope = true;
+ }
+
+ static List rlang_new_data_source(Environment env) {
+ static Function as_dictionary = rlang_object("as_dictionary");
+ return
+ as_dictionary(
+ env,
+ _["lookup_msg"] = "Column `%s`: not found in data",
+ _["read_only"] = true
+ );
+ }
+
+ static SEXP hybrid_get_callback(const String& name, bindrcpp::PAYLOAD payload) {
+ LOG_VERBOSE;
+ IHybridCallback* callback_ = reinterpret_cast<IHybridCallback*>(payload.p);
+ return callback_->get_subset(SymbolString(name));
+ }
+
+private:
+ const CharacterVector names;
+ const Environment env;
+ const IHybridCallback* callback;
+
+ mutable Environment overscope;
+ mutable bool has_overscope;
+};
+
+
+class GroupedHybridCall {
+public:
+ GroupedHybridCall(const Call& call_, const ILazySubsets& subsets_, const Environment& env_) :
+ original_call(call_), subsets(subsets_), env(env_)
+ {
+ LOG_VERBOSE;
+ }
+
+public:
+ // FIXME: replace the search & replace logic with overscoping
+ Call simplify(const SlicingIndex& indices) const {
+ set_indices(indices);
+ Call call = clone(original_call);
+ while (simplified(call)) {}
+ clear_indices();
+ return call;
+ }
+
+private:
+ bool simplified(Call& call) const {
+ LOG_VERBOSE;
+ // initial
+ if (TYPEOF(call) == LANGSXP || TYPEOF(call) == SYMSXP) {
+ boost::scoped_ptr<Result> res(get_handler(call, subsets, env));
+ if (res) {
+ // replace the call by the result of process
+ call = res->process(get_indices());
+
+ // no need to go any further, we simplified the top level
+ return true;
+ }
+ if (TYPEOF(call) == LANGSXP)
+ return replace(CDR(call));
+ }
+ return false;
+ }
+
+ bool replace(SEXP p) const {
+ LOG_VERBOSE;
+ SEXP obj = CAR(p);
+ if (TYPEOF(obj) == LANGSXP) {
+ boost::scoped_ptr<Result> res(get_handler(obj, subsets, env));
+ if (res) {
+ SETCAR(p, res->process(get_indices()));
+ return true;
+ }
+
+ if (replace(CDR(obj))) return true;
+ }
+
+ if (TYPEOF(p) == LISTSXP) {
+ return replace(CDR(p));
+ }
+
+ return false;
+ }
+
+ const SlicingIndex& get_indices() const {
+ return *indices;
+ }
+
+ void set_indices(const SlicingIndex& indices_) const {
+ indices = &indices_;
+ }
+
+ void clear_indices() const {
+ indices = NULL;
+ }
+
+private:
+ // Initialization
+ const Call original_call;
+ const ILazySubsets& subsets;
+ const Environment env;
+
+private:
+ // State
+ mutable const SlicingIndex* indices;
+};
+
+
+class GroupedHybridEval : public IHybridCallback {
+public:
+ GroupedHybridEval(const Call& call_, const ILazySubsets& subsets_, const Environment& env_) :
+ indices(NULL), subsets(subsets_), env(env_),
+ hybrid_env(subsets_.get_variable_names().get_vector(), env_, this),
+ hybrid_call(call_, subsets_, env_)
+ {
+ LOG_VERBOSE;
+ }
+
+ const SlicingIndex& get_indices() const {
+ return *indices;
+ }
+
+public: // IHybridCallback
+ SEXP get_subset(const SymbolString& name) const {
+ LOG_VERBOSE;
+ return subsets.get(name, get_indices());
+ }
+
+public:
+ SEXP eval(const SlicingIndex& indices_) {
+ set_indices(indices_);
+ SEXP ret = eval_with_indices();
+ clear_indices();
+ return ret;
+ }
+
+private:
+ void set_indices(const SlicingIndex& indices_) {
+ indices = &indices_;
+ }
+
+ void clear_indices() {
+ indices = NULL;
+ }
+
+ SEXP eval_with_indices() {
+ Call call = hybrid_call.simplify(get_indices());
+ LOG_INFO << type2name(call);
+
+ if (TYPEOF(call) == LANGSXP || TYPEOF(call) == SYMSXP) {
+ LOG_VERBOSE << "performing evaluation in overscope";
+ return Rcpp_eval(call, hybrid_env.get_overscope());
+ }
+ return call;
+ }
+
+private:
+ const SlicingIndex* indices;
+ const ILazySubsets& subsets;
+ Environment env;
+ const GroupedHybridEnv hybrid_env;
+ const GroupedHybridCall hybrid_call;
+};
+
+
+} // namespace dplyr
+
#endif
diff --git a/inst/include/dplyr/Result/GroupedSubset.h b/inst/include/dplyr/Result/GroupedSubset.h
index 331f179..684ff54 100644
--- a/inst/include/dplyr/Result/GroupedSubset.h
+++ b/inst/include/dplyr/Result/GroupedSubset.h
@@ -1,140 +1,136 @@
#ifndef dplyr_GroupedSubset_H
#define dplyr_GroupedSubset_H
+#include <tools/ShrinkableVector.h>
+
+#include <dplyr/DataFrameSubsetVisitors.h>
+#include <dplyr/SummarisedVariable.h>
+#include <dplyr/Result/GroupedSubsetBase.h>
+
namespace dplyr {
- class GroupedSubset {
- public:
- GroupedSubset(){} ;
- virtual ~GroupedSubset(){} ;
- virtual SEXP get( const SlicingIndex& indices ) = 0 ;
- virtual SEXP get_variable() const = 0 ;
- virtual bool is_summary() const = 0;
- } ;
-
- template <int RTYPE>
- class GroupedSubsetTemplate : public GroupedSubset {
- public:
- typedef typename Rcpp::traits::storage_type<RTYPE>::type STORAGE ;
- GroupedSubsetTemplate( SEXP x, int max_size ) :
- object(x), output(max_size, object), start( Rcpp::internal::r_vector_start<RTYPE>(object) ) {}
-
- virtual SEXP get( const SlicingIndex& indices ) {
- output.borrow( indices, start ) ;
- return output ;
- }
- virtual SEXP get_variable() const {
- return object ;
- }
- virtual bool is_summary() const {
- return false;
- }
-
- private:
- SEXP object ;
- ShrinkableVector<RTYPE> output ;
- STORAGE* start ;
- } ;
-
- class DataFrameGroupedSubset : public GroupedSubset {
- public:
- DataFrameGroupedSubset( SEXP x ) : data(x), visitors(data){}
-
- virtual SEXP get( const SlicingIndex& indices ) {
- return visitors.subset(indices, data.attr("class") ) ;
- }
-
- virtual SEXP get_variable() const {
- return data ;
- }
-
- virtual bool is_summary() const {
- return false ;
- }
-
- private:
- DataFrame data ;
- DataFrameSubsetVisitors visitors ;
- } ;
-
- inline GroupedSubset* grouped_subset(SEXP x, int max_size){
- switch( TYPEOF(x) ){
- case INTSXP: return new GroupedSubsetTemplate<INTSXP>(x, max_size) ;
- case REALSXP: return new GroupedSubsetTemplate<REALSXP>(x, max_size) ;
- case LGLSXP: return new GroupedSubsetTemplate<LGLSXP>(x, max_size) ;
- case STRSXP: return new GroupedSubsetTemplate<STRSXP>(x, max_size) ;
- case VECSXP:
- if( Rf_inherits( x, "data.frame" ) )
- return new DataFrameGroupedSubset(x) ;
- if( Rf_inherits( x, "POSIXlt" ) ) {
- stop( "POSIXlt not supported" ) ;
- }
- return new GroupedSubsetTemplate<VECSXP>(x, max_size) ;
- case CPLXSXP: return new GroupedSubsetTemplate<CPLXSXP>(x, max_size) ;
- default: break ;
- }
- stop("Unsupported vector type %s", Rf_type2char(TYPEOF(x))) ;
- return 0 ;
+template <int RTYPE>
+class GroupedSubsetTemplate : public GroupedSubset {
+public:
+ typedef typename Rcpp::traits::storage_type<RTYPE>::type STORAGE;
+ GroupedSubsetTemplate(SEXP x, int max_size) :
+ object(x), output(max_size, object), start(Rcpp::internal::r_vector_start<RTYPE>(object)) {}
+
+ virtual SEXP get(const SlicingIndex& indices) {
+ output.borrow(indices, start);
+ return output;
+ }
+ virtual SEXP get_variable() const {
+ return object;
+ }
+ virtual bool is_summary() const {
+ return false;
+ }
+
+private:
+ SEXP object;
+ ShrinkableVector<RTYPE> output;
+ STORAGE* start;
+};
+
+class DataFrameGroupedSubset : public GroupedSubset {
+public:
+ DataFrameGroupedSubset(SEXP x) : data(x), visitors(data) {}
+
+ virtual SEXP get(const SlicingIndex& indices) {
+ return visitors.subset(indices, get_class(data));
+ }
+
+ virtual SEXP get_variable() const {
+ return data;
+ }
+
+ virtual bool is_summary() const {
+ return false;
+ }
+
+private:
+ DataFrame data;
+ DataFrameSubsetVisitors visitors;
+};
+
+inline GroupedSubset* grouped_subset(SEXP x, int max_size) {
+ switch (TYPEOF(x)) {
+ case INTSXP:
+ return new GroupedSubsetTemplate<INTSXP>(x, max_size);
+ case REALSXP:
+ return new GroupedSubsetTemplate<REALSXP>(x, max_size);
+ case LGLSXP:
+ return new GroupedSubsetTemplate<LGLSXP>(x, max_size);
+ case STRSXP:
+ return new GroupedSubsetTemplate<STRSXP>(x, max_size);
+ case VECSXP:
+ if (Rf_inherits(x, "data.frame"))
+ return new DataFrameGroupedSubset(x);
+ if (Rf_inherits(x, "POSIXlt")) {
+ stop("POSIXlt not supported");
}
+ return new GroupedSubsetTemplate<VECSXP>(x, max_size);
+ case CPLXSXP:
+ return new GroupedSubsetTemplate<CPLXSXP>(x, max_size);
+ default:
+ break;
+ }
+ stop("is of unsupported type %s", Rf_type2char(TYPEOF(x)));
+}
- template <int RTYPE>
- class SummarisedSubsetTemplate : public GroupedSubset {
- public:
- typedef typename Rcpp::traits::storage_type<RTYPE>::type STORAGE ;
-
- SummarisedSubsetTemplate( SummarisedVariable x, int max_size ) :
- object(x), output(1) {}
-
- virtual SEXP get( const SlicingIndex& indices ) {
- output[0] = object[indices.group()] ;
- return output ;
- }
- virtual SEXP get_variable() const {
- return object ;
- }
- virtual bool is_summary() const {
- return true;
- }
-
- private:
- Rcpp::Vector<RTYPE> object ;
- Rcpp::Vector<RTYPE> output ;
- } ;
-
- template <>
- class SummarisedSubsetTemplate<VECSXP> : public GroupedSubset {
- public:
- SummarisedSubsetTemplate( SummarisedVariable x, int max_size ) : object(x){}
-
- virtual SEXP get( const SlicingIndex& indices ) {
- return List::create( object[indices.group()] ) ;
- }
-
- virtual SEXP get_variable() const {
- return object ;
- }
- virtual bool is_summary() const {
- return true;
- }
-
- private:
- List object ;
- } ;
-
- inline GroupedSubset* summarised_grouped_subset(SummarisedVariable x, int max_size){
- switch( TYPEOF(x) ){
- case LGLSXP: return new SummarisedSubsetTemplate<LGLSXP>(x, max_size) ;
- case INTSXP: return new SummarisedSubsetTemplate<INTSXP>(x, max_size) ;
- case REALSXP: return new SummarisedSubsetTemplate<REALSXP>(x, max_size) ;
- case STRSXP: return new SummarisedSubsetTemplate<STRSXP>(x, max_size) ;
- case VECSXP: return new SummarisedSubsetTemplate<VECSXP>(x, max_size) ;
- case CPLXSXP: return new SummarisedSubsetTemplate<CPLXSXP>(x, max_size) ;
- default: break ;
- }
- stop("Unsupported vector type %s", Rf_type2char(TYPEOF(x))) ;
- return 0 ;
- }
+template <int RTYPE>
+class SummarisedSubsetTemplate : public GroupedSubset {
+public:
+ typedef typename Rcpp::traits::storage_type<RTYPE>::type STORAGE;
+
+ SummarisedSubsetTemplate(SummarisedVariable x) :
+ object(x), output(1)
+ {
+ copy_most_attributes(output, object);
+ }
+
+ virtual SEXP get(const SlicingIndex& indices) {
+ output[0] = object[indices.group()];
+ return output;
+ }
+ virtual SEXP get_variable() const {
+ return object;
+ }
+ virtual bool is_summary() const {
+ return true;
+ }
+
+private:
+ Rcpp::Vector<RTYPE> object;
+ Rcpp::Vector<RTYPE> output;
+};
+
+template <>
+inline SEXP SummarisedSubsetTemplate<VECSXP>::get(const SlicingIndex& indices) {
+ return List::create(object[indices.group()]);
+}
+
+inline GroupedSubset* summarised_subset(SummarisedVariable x) {
+ switch (TYPEOF(x)) {
+ case LGLSXP:
+ return new SummarisedSubsetTemplate<LGLSXP>(x);
+ case INTSXP:
+ return new SummarisedSubsetTemplate<INTSXP>(x);
+ case REALSXP:
+ return new SummarisedSubsetTemplate<REALSXP>(x);
+ case STRSXP:
+ return new SummarisedSubsetTemplate<STRSXP>(x);
+ case VECSXP:
+ return new SummarisedSubsetTemplate<VECSXP>(x);
+ case CPLXSXP:
+ return new SummarisedSubsetTemplate<CPLXSXP>(x);
+ default:
+ break;
+ }
+ stop("is of unsupported type %s", Rf_type2char(TYPEOF(x)));
+}
}
#endif
diff --git a/inst/include/dplyr/Result/GroupedSubsetBase.h b/inst/include/dplyr/Result/GroupedSubsetBase.h
new file mode 100644
index 0000000..da2db59
--- /dev/null
+++ b/inst/include/dplyr/Result/GroupedSubsetBase.h
@@ -0,0 +1,21 @@
+#ifndef dplyr_GroupedSubsetBase_H
+#define dplyr_GroupedSubsetBase_H
+
+#include <tools/SlicingIndex.h>
+
+namespace dplyr {
+
+class GroupedSubset {
+public:
+ GroupedSubset() {};
+ virtual ~GroupedSubset() {};
+ virtual SEXP get(const SlicingIndex& indices) = 0;
+ virtual SEXP get_variable() const = 0;
+ virtual bool is_summary() const = 0;
+};
+
+typedef GroupedSubset RowwiseSubset;
+
+}
+
+#endif //dplyr_GroupedSubsetBase_H
diff --git a/inst/include/dplyr/Result/ILazySubsets.h b/inst/include/dplyr/Result/ILazySubsets.h
new file mode 100644
index 0000000..9abace6
--- /dev/null
+++ b/inst/include/dplyr/Result/ILazySubsets.h
@@ -0,0 +1,34 @@
+#ifndef dplyr_ILazySubsets_H
+#define dplyr_ILazySubsets_H
+
+#include <tools/SlicingIndex.h>
+#include <tools/SymbolString.h>
+#include <tools/SymbolVector.h>
+
+namespace dplyr {
+
+class ILazySubsets {
+protected:
+ ILazySubsets() {}
+
+public:
+ virtual ~ILazySubsets() {}
+
+ virtual const SymbolVector get_variable_names() const = 0;
+ virtual SEXP get_variable(const SymbolString& symbol) const = 0;
+ virtual SEXP get(const SymbolString& symbol, const SlicingIndex& indices) const = 0;
+ virtual bool is_summary(const SymbolString& symbol) const = 0;
+ virtual bool has_variable(const SymbolString& symbol) const = 0;
+ virtual void input(const SymbolString& symbol, SEXP x) = 0;
+ virtual int size() const = 0;
+ virtual int nrows() const = 0;
+
+public:
+ bool has_non_summary_variable(const SymbolString& symbol) const {
+ return has_variable(symbol) && !is_summary(symbol);
+ }
+};
+
+}
+
+#endif
diff --git a/inst/include/dplyr/Result/In.h b/inst/include/dplyr/Result/In.h
index 47a1996..1cefdcd 100644
--- a/inst/include/dplyr/Result/In.h
+++ b/inst/include/dplyr/Result/In.h
@@ -1,37 +1,40 @@
#ifndef dplyr_Result_In_H
#define dplyr_Result_In_H
+#include <tools/hash.h>
+
+#include <dplyr/Result/Mutater.h>
+
namespace dplyr {
- template <int RTYPE>
- class In : public Mutater<LGLSXP,In<RTYPE> > {
- public:
- typedef typename Rcpp::Vector<RTYPE> Vec ;
- typedef typename Rcpp::traits::storage_type<RTYPE>::type STORAGE ;
-
- In( Vec data_, Vec table_ ) :
- data(data_),
- table(table_),
- set(table.begin(), table.end())
- {}
-
- void process_slice( LogicalVector& out, const SlicingIndex& index, const SlicingIndex& out_index){
- int n = index.size() ;
- for( int i=0; i<n; i++){
- STORAGE value = data[index[i]] ;
- if(Vec::is_na(value)){
- out[ out_index[i] ] = false ;
- } else {
- out[ out_index[i] ] = set.count(value) ;
- }
- }
- }
-
- private:
- Vec data, table ;
- dplyr_hash_set<STORAGE> set ;
-
- } ;
+template <int RTYPE>
+class In : public Mutater<LGLSXP, In<RTYPE> > {
+public:
+ typedef typename Rcpp::Vector<RTYPE> Vec;
+ typedef typename Rcpp::traits::storage_type<RTYPE>::type STORAGE;
+
+ In(Vec data_, const Vec& table_) :
+ data(data_),
+ set(table_.begin(), table_.end())
+ {}
+
+ void process_slice(LogicalVector& out, const SlicingIndex& index, const SlicingIndex& out_index) {
+ int n = index.size();
+ for (int i = 0; i < n; i++) {
+ STORAGE value = data[index[i]];
+ if (Vec::is_na(value)) {
+ out[ out_index[i] ] = false;
+ } else {
+ out[ out_index[i] ] = set.count(value);
+ }
+ }
+ }
+
+private:
+ Vec data;
+ dplyr_hash_set<STORAGE> set;
+
+};
}
diff --git a/inst/include/dplyr/Result/Lag.h b/inst/include/dplyr/Result/Lag.h
index 417b726..eeabe2b 100644
--- a/inst/include/dplyr/Result/Lag.h
+++ b/inst/include/dplyr/Result/Lag.h
@@ -1,90 +1,90 @@
#ifndef dplyr_Result_Lag_H
#define dplyr_Result_Lag_H
+#include <tools/scalar_type.h>
+#include <tools/utils.h>
+
+#include <dplyr/Result/Result.h>
+
namespace dplyr {
- template <int RTYPE>
- class Lag : public Result {
- public:
- typedef typename scalar_type<RTYPE>::type STORAGE ;
-
- Lag( SEXP data_, int n_, const RObject& def_, bool is_summary_) :
- data(data_),
- n(n_),
- def( Vector<RTYPE>::get_na() ),
- is_summary(is_summary_)
- {
- if( !Rf_isNull(def_) ){
- def = as<STORAGE>(def_) ;
- }
- }
-
- virtual SEXP process(const GroupedDataFrame& gdf ){
- int nrows = gdf.nrows() ;
- int ng = gdf.ngroups() ;
-
- Vector<RTYPE> out = no_init(nrows) ;
- if( is_summary ){
- for(int i=0; i<nrows; i++) out[i] = def ;
- } else {
- GroupedDataFrame::group_iterator git = gdf.group_begin();
- for( int i=0; i<ng; i++, ++git){
- process_slice(out, *git, *git) ;
- }
- }
- copy_most_attributes( out, data ) ;
- return out ;
- }
-
- virtual SEXP process(const RowwiseDataFrame& gdf ){
- Vector<RTYPE> out(gdf.nrows(), def ) ;
- copy_most_attributes( out, data ) ;
- return out ;
- }
-
- virtual SEXP process(const FullDataFrame& df){
- int nrows = df.nrows() ;
- Vector<RTYPE> out = no_init(nrows) ;
- SlicingIndex index = df.get_index() ;
- process_slice( out, index, index );
- copy_most_attributes( out, data ) ;
- return out ;
- }
-
- virtual SEXP process(const SlicingIndex& index){
- int nrows = index.size() ;
- Vector<RTYPE> out = no_init(nrows) ;
- SlicingIndex fake(0, nrows) ;
- process_slice( out, index, fake );
- copy_most_attributes( out, data ) ;
- return out ;
- }
-
- private:
-
- void process_slice( Vector<RTYPE>& out, const SlicingIndex& index, const SlicingIndex& out_index){
- int chunk_size = index.size() ;
- int i=0 ;
-
- if( n > chunk_size ) {
- for(int i=0; i<chunk_size ; i++){
- out[out_index[i]] = def ;
- }
- } else {
- for(; i<n ; i++){
- out[out_index[i]] = def ;
- }
- for( ; i<chunk_size; i++ ){
- out[out_index[i]] = data[index[i-n]] ;
- }
- }
- }
-
- Vector<RTYPE> data ;
- int n ;
- STORAGE def ;
- bool is_summary ;
- } ;
+template <int RTYPE>
+class Lag : public Result {
+public:
+ typedef typename traits::scalar_type<RTYPE>::type STORAGE;
+
+ Lag(SEXP data_, int n_, const RObject& def_, bool is_summary_) :
+ data(data_),
+ n(n_),
+ def(Vector<RTYPE>::get_na()),
+ is_summary(is_summary_)
+ {
+ if (!Rf_isNull(def_)) {
+ def = as<STORAGE>(def_);
+ }
+ }
+
+ virtual SEXP process(const GroupedDataFrame& gdf) {
+ int nrows = gdf.nrows();
+ int ng = gdf.ngroups();
+
+ Vector<RTYPE> out = no_init(nrows);
+ if (is_summary) {
+ for (int i = 0; i < nrows; i++) out[i] = def;
+ } else {
+ GroupedDataFrame::group_iterator git = gdf.group_begin();
+ for (int i = 0; i < ng; i++, ++git) {
+ process_slice(out, *git, *git);
+ }
+ }
+ copy_most_attributes(out, data);
+ return out;
+ }
+
+ virtual SEXP process(const RowwiseDataFrame& gdf) {
+ Vector<RTYPE> out(gdf.nrows(), def);
+ copy_most_attributes(out, data);
+ return out;
+ }
+
+ virtual SEXP process(const FullDataFrame& df) {
+ int nrows = df.nrows();
+ Vector<RTYPE> out = no_init(nrows);
+ const SlicingIndex& index = df.get_index();
+ process_slice(out, index, index);
+ copy_most_attributes(out, data);
+ return out;
+ }
+
+ virtual SEXP process(const SlicingIndex& index) {
+ int nrows = index.size();
+ Vector<RTYPE> out = no_init(nrows);
+ NaturalSlicingIndex fake(nrows);
+ process_slice(out, index, fake);
+ copy_most_attributes(out, data);
+ return out;
+ }
+
+private:
+
+ void process_slice(Vector<RTYPE>& out, const SlicingIndex& index, const SlicingIndex& out_index) {
+ int chunk_size = index.size();
+ int n_def = std::min(chunk_size, n);
+
+ int i = 0;
+ for (; i < n_def; ++i) {
+ out[out_index[i]] = def;
+ }
+ for (; i < chunk_size; ++i) {
+ out[out_index[i]] = data[index[i - n]];
+ }
+ }
+
+ Vector<RTYPE> data;
+ int n;
+ STORAGE def;
+ bool is_summary;
+};
}
diff --git a/inst/include/dplyr/Result/LazyGroupedSubsets.h b/inst/include/dplyr/Result/LazyGroupedSubsets.h
index 432cfd3..59cd5ec 100644
--- a/inst/include/dplyr/Result/LazyGroupedSubsets.h
+++ b/inst/include/dplyr/Result/LazyGroupedSubsets.h
@@ -1,104 +1,126 @@
#ifndef dplyr_LazyGroupedSubsets_H
#define dplyr_LazyGroupedSubsets_H
+#include <tools/SymbolMap.h>
+
+#include <dplyr/GroupedDataFrame.h>
+#include <dplyr/SummarisedVariable.h>
+
+#include <dplyr/Result/GroupedSubset.h>
+#include <dplyr/Result/ILazySubsets.h>
+
namespace dplyr {
- class LazyGroupedSubsets : public LazySubsets {
- public:
- LazyGroupedSubsets( const GroupedDataFrame& gdf_ ) :
- LazySubsets(gdf_.data()),
- gdf(gdf_),
- symbol_map(),
- subsets(),
- resolved(),
- owner(true)
- {
- int max_size = gdf.max_group_size() ;
- const DataFrame& data = gdf.data() ;
- CharacterVector names = data.names() ;
- int n = data.size() ;
- for( int i=0; i<n; i++){
- input_subset( names[i], grouped_subset( data[i], max_size ) ) ;
- }
- }
-
- LazyGroupedSubsets( const LazyGroupedSubsets& other) :
- LazySubsets(other.gdf.data()),
- gdf(other.gdf),
- symbol_map(other.symbol_map),
- subsets(other.subsets),
- resolved(other.resolved),
- owner(false)
- {}
-
- void clear(){
- for( size_t i=0; i<resolved.size(); i++){
- resolved[i] = R_NilValue ;
- }
- }
-
- int count(SEXP head) const {
- int res = symbol_map.has(head);
- return res ;
- }
-
- virtual int size() const {
- return subsets.size();
- }
-
- SEXP get_variable( SEXP symbol ) const {
- return subsets[symbol_map.get(symbol)]->get_variable() ;
- }
- bool is_summary( SEXP symbol ) const {
- return subsets[symbol_map.get(symbol)]->is_summary() ;
- }
- SEXP get( SEXP symbol, const SlicingIndex& indices ){
- int idx = symbol_map.get(symbol) ;
-
- SEXP value = resolved[idx] ;
- if( value == R_NilValue ){
- resolved[idx] = value = subsets[idx]->get(indices) ;
- }
- return value ;
- }
-
- ~LazyGroupedSubsets(){
- if(owner) {
- for( size_t i=0; i<subsets.size(); i++){
- delete subsets[i] ;
- }
- }
- }
-
- void input(SEXP symbol, SEXP x){
- input_subset( symbol, grouped_subset(x, gdf.max_group_size() ) );
- }
-
- void input(SEXP symbol, SummarisedVariable x){
- input_subset( symbol, summarised_grouped_subset(x, gdf.max_group_size() ) ) ;
- }
-
- private:
- const GroupedDataFrame& gdf ;
- SymbolMap symbol_map ;
- std::vector<GroupedSubset*> subsets ;
- std::vector<SEXP> resolved ;
-
- bool owner ;
-
- void input_subset(SEXP symbol, GroupedSubset* sub){
- SymbolMapIndex index = symbol_map.insert(symbol) ;
- if( index.origin == NEW ){
- subsets.push_back(sub) ;
- resolved.push_back(R_NilValue) ;
- } else {
- int idx = index.pos ;
- delete subsets[idx] ;
- subsets[idx] = sub ;
- resolved[idx] = R_NilValue ;
- }
- }
- } ;
+template <class Data>
+class LazySplitSubsets : public ILazySubsets {
+ typedef typename Data::subset subset;
+
+public:
+ LazySplitSubsets(const Data& gdf_) :
+ gdf(gdf_),
+ subsets(),
+ symbol_map(),
+ resolved(),
+ owner(true)
+ {
+ const DataFrame& data = gdf.data();
+ CharacterVector names = data.names();
+ int n = data.size();
+ LOG_INFO << "processing " << n << " vars: " << names;
+ for (int i = 0; i < n; i++) {
+ input(names[i], data[i]);
+ }
+ }
+
+ LazySplitSubsets(const LazySplitSubsets& other) :
+ gdf(other.gdf),
+ subsets(other.subsets),
+ symbol_map(other.symbol_map),
+ resolved(other.resolved),
+ owner(false)
+ {}
+
+ virtual ~LazySplitSubsets() {
+ if (owner) {
+ for (size_t i = 0; i < subsets.size(); i++) {
+ delete subsets[i];
+ }
+ }
+ }
+
+public:
+ virtual const SymbolVector get_variable_names() const {
+ return symbol_map.get_names();
+ }
+
+ virtual SEXP get_variable(const SymbolString& symbol) const {
+ return subsets[symbol_map.get(symbol)]->get_variable();
+ }
+
+ virtual SEXP get(const SymbolString& symbol, const SlicingIndex& indices) const {
+ int idx = symbol_map.get(symbol);
+
+ SEXP value = resolved[idx];
+ if (value == R_NilValue) {
+ resolved[idx] = value = subsets[idx]->get(indices);
+ }
+ return value;
+ }
+
+ virtual bool is_summary(const SymbolString& symbol) const {
+ return subsets[symbol_map.get(symbol)]->is_summary();
+ }
+
+ virtual bool has_variable(const SymbolString& head) const {
+ return symbol_map.has(head);
+ }
+
+ virtual void input(const SymbolString& symbol, SEXP x) {
+ input_subset(symbol, gdf.create_subset(x));
+ }
+
+ virtual int size() const {
+ return subsets.size();
+ }
+
+ virtual int nrows() const {
+ return gdf.nrows();
+ }
+
+public:
+ void clear() {
+ for (size_t i = 0; i < resolved.size(); i++) {
+ resolved[i] = R_NilValue;
+ }
+ }
+
+ void input_summarised(const SymbolString& symbol, SummarisedVariable x) {
+ input_subset(symbol, summarised_subset(x));
+ }
+
+private:
+ const Data& gdf;
+ std::vector<subset*> subsets;
+ SymbolMap symbol_map;
+ mutable std::vector<SEXP> resolved;
+
+ bool owner;
+
+ void input_subset(const SymbolString& symbol, subset* sub) {
+ SymbolMapIndex index = symbol_map.insert(symbol);
+ if (index.origin == NEW) {
+ subsets.push_back(sub);
+ resolved.push_back(R_NilValue);
+ } else {
+ int idx = index.pos;
+ delete subsets[idx];
+ subsets[idx] = sub;
+ resolved[idx] = R_NilValue;
+ }
+ }
+};
+
+typedef LazySplitSubsets<GroupedDataFrame> LazyGroupedSubsets;
}
#endif
diff --git a/inst/include/dplyr/Result/LazyRowwiseSubsets.h b/inst/include/dplyr/Result/LazyRowwiseSubsets.h
index b7daca3..50e91b2 100644
--- a/inst/include/dplyr/Result/LazyRowwiseSubsets.h
+++ b/inst/include/dplyr/Result/LazyRowwiseSubsets.h
@@ -1,92 +1,12 @@
#ifndef dplyr_LazyRowwiseSubsets_H
#define dplyr_LazyRowwiseSubsets_H
-namespace dplyr {
-
- class LazyRowwiseSubsets : public LazySubsets {
- public:
- typedef dplyr_hash_map<SEXP, RowwiseSubset*> RowwiseSubsetMap ;
- typedef dplyr_hash_map<SEXP, SEXP> ResolvedSubsetMap ;
-
- LazyRowwiseSubsets( const RowwiseDataFrame& rdf_ ):
- LazySubsets(rdf_.data()), rdf(rdf_), subset_map(), resolved_map(), owner(true)
- {
- const DataFrame& data = rdf.data() ;
- CharacterVector names = data.names() ;
- int n = data.size() ;
- for( int i=0; i<n; i++){
- subset_map[ Rf_installChar( names[i] ) ] = rowwise_subset( data[i] );
- }
- }
-
- LazyRowwiseSubsets( const LazyRowwiseSubsets& other) :
- LazySubsets(other.rdf.data()), rdf(other.rdf), subset_map(other.subset_map), resolved_map(other.resolved_map), owner(false)
- {}
-
- void clear(){
- resolved_map.clear() ;
- }
-
- int count(SEXP head) const {
- return subset_map.count(head);
- }
- virtual int size() const {
- return subset_map.size();
- }
-
- SEXP get_variable( SEXP symbol ) const {
- RowwiseSubsetMap::const_iterator it = subset_map.find( symbol );
- if( it == subset_map.end() ){
- stop( "variable '%s' not found in the dataset", CHAR(PRINTNAME(symbol)) ) ;
- }
- return it->second->get_variable() ;
- }
- bool is_summary( SEXP symbol ) const {
- RowwiseSubsetMap::const_iterator it = subset_map.find( symbol );
- return it->second->is_summary() ;
- }
- SEXP get( SEXP symbol, const SlicingIndex& indices ){
- ResolvedSubsetMap::const_iterator it = resolved_map.find( symbol ) ;
- if( it == resolved_map.end() ){
- SEXP res = subset_map[symbol]->get( indices ) ;
- resolved_map[symbol] = res ;
- return res ;
- } else {
- return it->second ;
- }
- }
-
- ~LazyRowwiseSubsets(){
- if(owner) delete_all_second( subset_map ) ;
- }
-
- void input(SEXP symbol, SEXP x){
- if( TYPEOF(symbol) == SYMSXP ){
- input_subset( symbol, rowwise_subset(x) );
- } else {
- input_subset( Rf_installChar(symbol), rowwise_subset(x) );
- }
- }
-
- private:
- const RowwiseDataFrame& rdf ;
- RowwiseSubsetMap subset_map ;
- ResolvedSubsetMap resolved_map ;
- bool owner ;
-
- void input_subset(SEXP symbol, RowwiseSubset* sub){
- RowwiseSubsetMap::iterator it = subset_map.find(symbol) ;
- if( it == subset_map.end() ){
- subset_map[symbol] = sub ;
- } else {
- // found it, replacing the subset
- delete it->second ;
- it->second = sub ;
- }
- }
- } ;
+#include <dplyr/Result/LazyGroupedSubsets.h>
+#include <dplyr/RowwiseDataFrame.h>
+namespace dplyr {
+typedef LazySplitSubsets<RowwiseDataFrame> LazyRowwiseSubsets;
}
#endif
diff --git a/inst/include/dplyr/Result/LazySubsets.h b/inst/include/dplyr/Result/LazySubsets.h
index bd56fc4..9f1bb80 100644
--- a/inst/include/dplyr/Result/LazySubsets.h
+++ b/inst/include/dplyr/Result/LazySubsets.h
@@ -1,62 +1,90 @@
#ifndef dplyr_LazySubsets_H
#define dplyr_LazySubsets_H
+#include <tools/SymbolMap.h>
+#include <tools/SlicingIndex.h>
+#include <dplyr/Result/ILazySubsets.h>
+
namespace dplyr {
- class LazySubsets {
- public:
- SymbolMap symbol_map ;
- std::vector<SEXP> data ;
- int nr ;
-
- LazySubsets( const DataFrame& df) : nr(df.nrows()){
- int nvars = df.size() ;
- if( nvars ){
- CharacterVector names = df.names() ;
- for( int i=0; i<nvars; i++){
- SEXP column = df[i] ;
- if( Rf_inherits( column, "matrix" ) ){
- stop( "matrix as column is not supported" ) ;
- }
- symbol_map.insert( names[i] ) ;
- data.push_back( df[i] ) ;
- }
- }
+class LazySubsets : public ILazySubsets {
+public:
+ LazySubsets(const DataFrame& df) : nr(df.nrows()) {
+ int nvars = df.size();
+ if (nvars) {
+ CharacterVector names = df.names();
+ for (int i = 0; i < nvars; i++) {
+ SEXP column = df[i];
+ if (Rf_inherits(column, "matrix")) {
+ stop("matrix as column is not supported");
}
- virtual ~LazySubsets(){}
+ symbol_map.insert(names[i]);
+ data.push_back(df[i]);
+ }
+ }
+ }
+ virtual ~LazySubsets() {}
- virtual SEXP get_variable(SEXP symbol) const {
- return data[ symbol_map.get(symbol) ] ;
- }
- virtual bool is_summary( SEXP symbol ) const {
- return false ;
- }
- virtual int count(SEXP symbol) const{
- int res = symbol_map.has(symbol);
- return res ;
- }
+public:
+ virtual const SymbolVector get_variable_names() const {
+ return symbol_map.get_names();
+ }
- virtual void input( SEXP symbol, SEXP x){
- SymbolMapIndex index = symbol_map.insert(symbol) ;
- if( index.origin == NEW ){
- data.push_back(x) ;
- } else {
- data[index.pos] = x ;
- }
- }
+ virtual SEXP get_variable(const SymbolString& symbol) const {
+ return data[ symbol_map.get(symbol) ];
+ }
- virtual int size() const{
- return data.size() ;
- }
+ virtual SEXP get(const SymbolString& symbol, const SlicingIndex& indices) const {
+ const int pos = symbol_map.get(symbol);
+ SEXP col = data[pos];
+ if (!indices.is_identity(col) && Rf_length(col) != 1)
+ stop("Attempt to query lazy column with non-natural slicing index");
- inline int nrows() const {
- return nr ;
- }
+ return col;
+ }
- inline SEXP& operator[](SEXP symbol){
- return data[symbol_map.get(symbol)] ;
- }
- } ;
+ virtual bool is_summary(const SymbolString& symbol) const {
+ return summary_map.has(symbol);
+ }
+
+ virtual bool has_variable(const SymbolString& symbol) const {
+ return symbol_map.has(symbol);
+ }
+
+ virtual void input(const SymbolString& symbol, SEXP x) {
+ SymbolMapIndex index = symbol_map.insert(symbol);
+ if (index.origin == NEW) {
+ data.push_back(x);
+ } else {
+ data[index.pos] = x;
+ }
+ }
+
+ virtual int size() const {
+ return data.size();
+ }
+
+ virtual int nrows() const {
+ return nr;
+ }
+
+ void input_summarised(const SymbolString& symbol, SummarisedVariable x) {
+ input(symbol, x);
+ summary_map.insert(symbol);
+ }
+
+public:
+ void clear() {}
+
+ inline SEXP& operator[](const SymbolString& symbol) {
+ return data[symbol_map.get(symbol)];
+ }
+
+private:
+ SymbolMap symbol_map, summary_map;
+ std::vector<SEXP> data;
+ int nr;
+};
}
diff --git a/inst/include/dplyr/Result/Lead.h b/inst/include/dplyr/Result/Lead.h
index b2d826f..695d760 100644
--- a/inst/include/dplyr/Result/Lead.h
+++ b/inst/include/dplyr/Result/Lead.h
@@ -1,94 +1,90 @@
#ifndef dplyr_Result_Lead_H
#define dplyr_Result_Lead_H
+#include <tools/scalar_type.h>
+#include <tools/utils.h>
+
+#include <dplyr/Result/Result.h>
+
namespace dplyr {
- template <int RTYPE>
- struct scalar_type {
- typedef typename traits::storage_type<RTYPE>::type type ;
- } ;
- template <>
- struct scalar_type<STRSXP> {
- typedef String type ;
- } ;
-
- template <int RTYPE>
- class Lead : public Result {
- public:
- typedef typename scalar_type<RTYPE>::type STORAGE ;
-
- Lead( SEXP data_, int n_, const RObject& def_, bool is_summary_) :
- data(data_),
- n(n_),
- def(Vector<RTYPE>::get_na()),
- is_summary(is_summary_)
- {
- if( !Rf_isNull(def_)){
- def = as<STORAGE>( def_ ) ;
- }
- }
-
- virtual SEXP process(const GroupedDataFrame& gdf ){
- int nrows = gdf.nrows() ;
- int ng = gdf.ngroups() ;
-
- Vector<RTYPE> out = no_init(nrows) ;
- if( is_summary ){
- for(int i=0; i<nrows; i++) out[i] = def ;
- } else {
- GroupedDataFrame::group_iterator git = gdf.group_begin();
- for( int i=0; i<ng; i++, ++git){
- process_slice(out, *git, *git) ;
- }
- }
- copy_most_attributes( out, data ) ;
- return out ;
- }
-
- virtual SEXP process(const RowwiseDataFrame& gdf ){
- int nrows = gdf.nrows() ;
-
- Vector<RTYPE> out(nrows, def ) ;
- copy_most_attributes( out, data ) ;
- return out ;
- }
-
- virtual SEXP process(const FullDataFrame& df){
- int nrows = df.nrows() ;
- Vector<RTYPE> out = no_init(nrows) ;
- SlicingIndex index = df.get_index() ;
- process_slice( out, index, index );
- copy_most_attributes( out, data ) ;
- return out ;
- }
-
- virtual SEXP process(const SlicingIndex& index){
- int nrows = index.size() ;
- Vector<RTYPE> out = no_init(nrows) ;
- SlicingIndex fake(0, nrows) ;
- process_slice( out, index, fake );
- copy_most_attributes( out, data ) ;
- return out ;
- }
-
- private:
-
- void process_slice( Vector<RTYPE>& out, const SlicingIndex& index, const SlicingIndex& out_index){
- int chunk_size = index.size() ;
- int i=0 ;
- for( ; i<chunk_size-n; i++ ){
- out[out_index[i]] = data[index[i+n]] ;
- }
- for(; i<chunk_size; i++){
- out[out_index[i]] = def ;
- }
- }
-
- Vector<RTYPE> data ;
- int n ;
- STORAGE def ;
- bool is_summary ;
- } ;
+template <int RTYPE>
+class Lead : public Result {
+public:
+ typedef typename traits::scalar_type<RTYPE>::type STORAGE;
+
+ Lead(SEXP data_, int n_, const RObject& def_, bool is_summary_) :
+ data(data_),
+ n(n_),
+ def(Vector<RTYPE>::get_na()),
+ is_summary(is_summary_)
+ {
+ if (!Rf_isNull(def_)) {
+ def = as<STORAGE>(def_);
+ }
+ }
+
+ virtual SEXP process(const GroupedDataFrame& gdf) {
+ int nrows = gdf.nrows();
+ int ng = gdf.ngroups();
+
+ Vector<RTYPE> out = no_init(nrows);
+ if (is_summary) {
+ for (int i = 0; i < nrows; i++) out[i] = def;
+ } else {
+ GroupedDataFrame::group_iterator git = gdf.group_begin();
+ for (int i = 0; i < ng; i++, ++git) {
+ process_slice(out, *git, *git);
+ }
+ }
+ copy_most_attributes(out, data);
+ return out;
+ }
+
+ virtual SEXP process(const RowwiseDataFrame& gdf) {
+ int nrows = gdf.nrows();
+
+ Vector<RTYPE> out(nrows, def);
+ copy_most_attributes(out, data);
+ return out;
+ }
+
+ virtual SEXP process(const FullDataFrame& df) {
+ int nrows = df.nrows();
+ Vector<RTYPE> out = no_init(nrows);
+ const SlicingIndex& index = df.get_index();
+ process_slice(out, index, index);
+ copy_most_attributes(out, data);
+ return out;
+ }
+
+ virtual SEXP process(const SlicingIndex& index) {
+ int nrows = index.size();
+ Vector<RTYPE> out = no_init(nrows);
+ NaturalSlicingIndex fake(nrows);
+ process_slice(out, index, fake);
+ copy_most_attributes(out, data);
+ return out;
+ }
+
+private:
+
+ void process_slice(Vector<RTYPE>& out, const SlicingIndex& index, const SlicingIndex& out_index) {
+ int chunk_size = index.size();
+ int i = 0;
+ for (; i < chunk_size - n; i++) {
+ out[out_index[i]] = data[index[i + n]];
+ }
+ for (; i < chunk_size; i++) {
+ out[out_index[i]] = def;
+ }
+ }
+
+ Vector<RTYPE> data;
+ int n;
+ STORAGE def;
+ bool is_summary;
+};
}
diff --git a/inst/include/dplyr/Result/Mean.h b/inst/include/dplyr/Result/Mean.h
index d805459..5de1c12 100644
--- a/inst/include/dplyr/Result/Mean.h
+++ b/inst/include/dplyr/Result/Mean.h
@@ -1,114 +1,116 @@
#ifndef dplyr_Result_Mean_H
#define dplyr_Result_Mean_H
+#include <dplyr/Result/Processor.h>
+
namespace dplyr {
namespace internal {
- // version for NA_RM == true
- template <int RTYPE, bool NA_RM, typename Index>
- struct Mean_internal{
- static double process(typename Rcpp::traits::storage_type<RTYPE>::type* ptr, const Index& indices){
- typedef typename Rcpp::traits::storage_type<RTYPE>::type STORAGE ;
- long double res = 0.0 ;
- int n = indices.size() ;
- int m = 0;
- for( int i=0; i<n; i++){
- STORAGE value = ptr[ indices[i] ] ;
- if( ! Rcpp::traits::is_na<RTYPE>( value ) ){
- res += value ;
- m++ ;
- }
- }
- if( m == 0 ) return R_NaN ;
- res /= m ;
-
- if(R_FINITE(res)) {
- long double t = 0.0 ;
- for (int i = 0; i<n; i++) {
- STORAGE value = ptr[indices[i]] ;
- if( ! Rcpp::traits::is_na<RTYPE>( value ) ){
- t += value - res;
- }
- }
- res += t/m;
- }
-
- return (double)res ;
- }
- } ;
-
- // special cases for NA_RM == false
- template <typename Index>
- struct Mean_internal<INTSXP,false,Index>{
- static double process( int* ptr, const Index& indices ){
- long double res = 0.0 ;
- int n = indices.size() ;
- for( int i=0; i<n; i++){
- int value = ptr[ indices[i] ] ;
- // need to handle missing value specifically
- if( value == NA_INTEGER ){
- return NA_REAL ;
- }
- res += value ;
- }
- res /= n ;
-
- if(R_FINITE((double)res)) {
- long double t = 0.0 ;
- for (int i = 0; i<n; i++) {
- t += ptr[indices[i]] - res;
- }
- res += t/n;
- }
- return (double)res ;
- }
- } ;
-
- template <typename Index>
- struct Mean_internal<REALSXP,false,Index>{
- static double process( double* ptr, const Index& indices ){
- long double res = 0.0 ;
- int n = indices.size() ;
- for( int i=0; i<n; i++){
- res += ptr[ indices[i] ] ;
- }
- res /= n ;
-
- if(R_FINITE((double)res)) {
- long double t = 0.0 ;
- for (int i = 0; i<n; i++) {
- t += ptr[indices[i]] - res;
- }
- res += t/n;
- }
- return (double)res ;
+// version for NA_RM == true
+template <int RTYPE, bool NA_RM, typename Index>
+struct Mean_internal {
+ static double process(typename Rcpp::traits::storage_type<RTYPE>::type* ptr, const Index& indices) {
+ typedef typename Rcpp::traits::storage_type<RTYPE>::type STORAGE;
+ long double res = 0.0;
+ int n = indices.size();
+ int m = 0;
+ for (int i = 0; i < n; i++) {
+ STORAGE value = ptr[ indices[i] ];
+ if (! Rcpp::traits::is_na<RTYPE>(value)) {
+ res += value;
+ m++;
+ }
+ }
+ if (m == 0) return R_NaN;
+ res /= m;
+
+ if (R_FINITE(res)) {
+ long double t = 0.0;
+ for (int i = 0; i < n; i++) {
+ STORAGE value = ptr[indices[i]];
+ if (! Rcpp::traits::is_na<RTYPE>(value)) {
+ t += value - res;
}
- } ;
+ }
+ res += t / m;
+ }
+
+ return (double)res;
+ }
+};
+
+// special cases for NA_RM == false
+template <typename Index>
+struct Mean_internal<INTSXP, false, Index> {
+ static double process(int* ptr, const Index& indices) {
+ long double res = 0.0;
+ int n = indices.size();
+ for (int i = 0; i < n; i++) {
+ int value = ptr[ indices[i] ];
+ // need to handle missing value specifically
+ if (value == NA_INTEGER) {
+ return NA_REAL;
+ }
+ res += value;
+ }
+ res /= n;
+
+ if (R_FINITE((double)res)) {
+ long double t = 0.0;
+ for (int i = 0; i < n; i++) {
+ t += ptr[indices[i]] - res;
+ }
+ res += t / n;
+ }
+ return (double)res;
+ }
+};
+
+template <typename Index>
+struct Mean_internal<REALSXP, false, Index> {
+ static double process(double* ptr, const Index& indices) {
+ long double res = 0.0;
+ int n = indices.size();
+ for (int i = 0; i < n; i++) {
+ res += ptr[ indices[i] ];
+ }
+ res /= n;
+
+ if (R_FINITE((double)res)) {
+ long double t = 0.0;
+ for (int i = 0; i < n; i++) {
+ t += ptr[indices[i]] - res;
+ }
+ res += t / n;
+ }
+ return (double)res;
+ }
+};
} // namespace internal
- template <int RTYPE, bool NA_RM>
- class Mean : public Processor< REALSXP, Mean<RTYPE,NA_RM> > {
- public:
- typedef Processor< REALSXP, Mean<RTYPE,NA_RM> > Base ;
- typedef typename Rcpp::traits::storage_type<RTYPE>::type STORAGE ;
-
- Mean(SEXP x, bool is_summary_ = false) :
- Base(x),
- data_ptr( Rcpp::internal::r_vector_start<RTYPE>(x)),
- is_summary(is_summary_)
- {}
- ~Mean(){}
-
- inline double process_chunk( const SlicingIndex& indices ){
- if( is_summary ) return data_ptr[indices.group()] ;
- return internal::Mean_internal<RTYPE,NA_RM,SlicingIndex>::process(data_ptr, indices) ;
- }
+template <int RTYPE, bool NA_RM>
+class Mean : public Processor< REALSXP, Mean<RTYPE, NA_RM> > {
+public:
+ typedef Processor< REALSXP, Mean<RTYPE, NA_RM> > Base;
+ typedef typename Rcpp::traits::storage_type<RTYPE>::type STORAGE;
+
+ Mean(SEXP x, bool is_summary_ = false) :
+ Base(x),
+ data_ptr(Rcpp::internal::r_vector_start<RTYPE>(x)),
+ is_summary(is_summary_)
+ {}
+ ~Mean() {}
+
+ inline double process_chunk(const SlicingIndex& indices) {
+ if (is_summary) return data_ptr[indices.group()];
+ return internal::Mean_internal<RTYPE, NA_RM, SlicingIndex>::process(data_ptr, indices);
+ }
- private:
- STORAGE* data_ptr ;
- bool is_summary ;
- } ;
+private:
+ STORAGE* data_ptr;
+ bool is_summary;
+};
}
diff --git a/inst/include/dplyr/Result/MinMax.h b/inst/include/dplyr/Result/MinMax.h
new file mode 100644
index 0000000..8a7e934
--- /dev/null
+++ b/inst/include/dplyr/Result/MinMax.h
@@ -0,0 +1,69 @@
+#ifndef dplyr_Result_MinMax_H
+#define dplyr_Result_MinMax_H
+
+#include <dplyr/Result/is_smaller.h>
+#include <dplyr/Result/Processor.h>
+
+namespace dplyr {
+
+template <int RTYPE, bool MINIMUM, bool NA_RM>
+class MinMax : public Processor<REALSXP, MinMax<RTYPE, MINIMUM, NA_RM> > {
+
+public:
+ typedef Processor<REALSXP, MinMax<RTYPE, MINIMUM, NA_RM> > Base;
+ typedef typename Rcpp::traits::storage_type<RTYPE>::type STORAGE;
+
+private:
+ static const double Inf;
+
+public:
+ MinMax(SEXP x, bool is_summary_ = false) :
+ Base(x),
+ data_ptr(Rcpp::internal::r_vector_start<RTYPE>(x)),
+ is_summary(is_summary_)
+ {}
+ ~MinMax() {}
+
+ double process_chunk(const SlicingIndex& indices) {
+ if (is_summary) return data_ptr[ indices.group() ];
+
+ const int n = indices.size();
+ double res = Inf;
+
+ for (int i = 0; i < n; ++i) {
+ STORAGE current = data_ptr[indices[i]];
+
+ if (Rcpp::Vector<RTYPE>::is_na(current)) {
+ if (NA_RM)
+ continue;
+ else
+ return NA_REAL;
+ }
+ else {
+ double current_res = current;
+ if (is_better(current_res, res))
+ res = current_res;
+ }
+ }
+
+ return res;
+ }
+
+ inline static bool is_better(const double current, const double res) {
+ if (MINIMUM)
+ return internal::is_smaller<REALSXP>(current, res);
+ else
+ return internal::is_smaller<REALSXP>(res, current);
+ }
+
+private:
+ STORAGE* data_ptr;
+ bool is_summary;
+};
+
+template <int RTYPE, bool MINIMUM, bool NA_RM>
+const double MinMax<RTYPE, MINIMUM, NA_RM>::Inf = (MINIMUM ? R_PosInf : R_NegInf);
+
+}
+
+#endif
diff --git a/inst/include/dplyr/Result/Mutater.h b/inst/include/dplyr/Result/Mutater.h
index 310fa7b..cb430ce 100644
--- a/inst/include/dplyr/Result/Mutater.h
+++ b/inst/include/dplyr/Result/Mutater.h
@@ -1,50 +1,52 @@
#ifndef dplyr_Result_Mutater_H
#define dplyr_Result_Mutater_H
+#include <dplyr/Result/Result.h>
+
namespace dplyr {
- template <int RTYPE, typename Derived>
- class Mutater : public Result {
- public:
-
- virtual SEXP process(const GroupedDataFrame& gdf ){
- int ng = gdf.ngroups() ;
-
- Vector<RTYPE> out = no_init(gdf.nrows()) ;
- GroupedDataFrame::group_iterator git = gdf.group_begin();
- for( int i=0; i<ng; i++, ++git){
- static_cast<Derived&>(*this).process_slice(out, *git, *git) ;
- }
- return out ;
- }
-
- virtual SEXP process(const RowwiseDataFrame& gdf ){
- int ng = gdf.ngroups() ;
-
- Vector<RTYPE> out = no_init(gdf.nrows()) ;
- RowwiseDataFrame::group_iterator git = gdf.group_begin();
- for( int i=0; i<ng; i++, ++git){
- static_cast<Derived&>(*this).process_slice(out, *git, *git) ;
- }
- return out ;
- }
-
- virtual SEXP process(const FullDataFrame& df){
- Vector<RTYPE> out = no_init(df.nrows()) ;
- SlicingIndex index = df.get_index() ;
- static_cast<Derived&>(*this).process_slice( out, index, index );
- return out ;
- }
-
- virtual SEXP process(const SlicingIndex& index){
- int nrows = index.size() ;
- Vector<RTYPE> out = no_init(nrows) ;
- SlicingIndex fake(0, nrows) ;
- static_cast<Derived&>(*this).process_slice( out, index, fake );
- return out ;
- }
-
- } ;
+template <int RTYPE, typename Derived>
+class Mutater : public Result {
+public:
+
+ virtual SEXP process(const GroupedDataFrame& gdf) {
+ int ng = gdf.ngroups();
+
+ Vector<RTYPE> out = no_init(gdf.nrows());
+ GroupedDataFrame::group_iterator git = gdf.group_begin();
+ for (int i = 0; i < ng; i++, ++git) {
+ static_cast<Derived&>(*this).process_slice(out, *git, *git);
+ }
+ return out;
+ }
+
+ virtual SEXP process(const RowwiseDataFrame& gdf) {
+ int ng = gdf.ngroups();
+
+ Vector<RTYPE> out = no_init(gdf.nrows());
+ RowwiseDataFrame::group_iterator git = gdf.group_begin();
+ for (int i = 0; i < ng; i++, ++git) {
+ static_cast<Derived&>(*this).process_slice(out, *git, *git);
+ }
+ return out;
+ }
+
+ virtual SEXP process(const FullDataFrame& df) {
+ Vector<RTYPE> out = no_init(df.nrows());
+ const SlicingIndex& index = df.get_index();
+ static_cast<Derived&>(*this).process_slice(out, index, index);
+ return out;
+ }
+
+ virtual SEXP process(const SlicingIndex& index) {
+ int nrows = index.size();
+ Vector<RTYPE> out = no_init(nrows);
+ NaturalSlicingIndex fake(nrows);
+ static_cast<Derived&>(*this).process_slice(out, index, fake);
+ return out;
+ }
+
+};
}
diff --git a/inst/include/dplyr/Result/Processor.h b/inst/include/dplyr/Result/Processor.h
index 46f5603..3cb8150 100644
--- a/inst/include/dplyr/Result/Processor.h
+++ b/inst/include/dplyr/Result/Processor.h
@@ -1,104 +1,111 @@
#ifndef dplyr_Result_Processor_H
#define dplyr_Result_Processor_H
-namespace dplyr{
-
- // if we derive from this instead of deriving from Result, all we have to
- // do is implement a process_chunk method that takes a SlicingIndex& as
- // input and returns the suitable type (i.e. storage_type<OUTPUT>)
- // all the builtin result implementation (Mean, ...) use this.
- template <int OUTPUT, typename CLASS>
- class Processor : public Result {
- public:
- typedef typename Rcpp::traits::storage_type<OUTPUT>::type STORAGE ;
-
- Processor() : data(R_NilValue) {}
-
- Processor(SEXP data_) : data(data_) {}
-
- virtual SEXP process(const Rcpp::GroupedDataFrame& gdf ) {
- return process_grouped<GroupedDataFrame>( gdf ) ;
- }
-
- virtual SEXP process(const Rcpp::RowwiseDataFrame& gdf ) {
- return process_grouped<RowwiseDataFrame>( gdf ) ;
- }
-
- virtual SEXP process( const Rcpp::FullDataFrame& df){
- return promote(process( df.get_index() )) ;
- }
-
- virtual SEXP process( const SlicingIndex& index){
- CLASS* obj = static_cast<CLASS*>(this) ;
- Rcpp::Vector<OUTPUT> res = Rcpp::Vector<OUTPUT>::create( obj->process_chunk(index) );
- copy_attributes(res, data) ;
- return res ;
- }
-
- private:
-
- template <typename Data>
- SEXP process_grouped(const Data& gdf ) {
- int n = gdf.ngroups() ;
- Rcpp::Shield<SEXP> res( Rf_allocVector( OUTPUT, n) );
- STORAGE* ptr = Rcpp::internal::r_vector_start<OUTPUT>(res) ;
- CLASS* obj = static_cast<CLASS*>(this) ;
- typename Data::group_iterator git = gdf.group_begin();
- for( int i=0; i<n; i++, ++git)
- ptr[i] = obj->process_chunk(*git) ;
- copy_attributes(res, data) ;
- return res ;
- }
-
- inline SEXP promote(SEXP obj){
- RObject res(obj) ;
- copy_attributes(res, data) ;
- return res ;
- }
-
-
- SEXP data ;
-
-
-
- } ;
-
- template <typename CLASS>
- class Processor<STRSXP, CLASS> : public Result {
- public:
- Processor(SEXP data_): data(data_){}
-
- virtual SEXP process(const Rcpp::GroupedDataFrame& gdf) {
- return process_grouped<GroupedDataFrame>(gdf) ;
- }
- virtual SEXP process(const Rcpp::RowwiseDataFrame& gdf) {
- return process_grouped<RowwiseDataFrame>(gdf) ;
- }
-
- virtual SEXP process( const Rcpp::FullDataFrame& df){
- return process( df.get_index() );
- }
-
- virtual SEXP process( const SlicingIndex& index){
- CLASS* obj = static_cast<CLASS*>(this) ;
- return CharacterVector::create( obj->process_chunk(index) );
- }
-
- private:
-
- template <typename Data>
- SEXP process_grouped(const Data& gdf) {
- int n = gdf.ngroups() ;
- Rcpp::Shield<SEXP> res( Rf_allocVector( STRSXP, n) ) ;
- CLASS* obj = static_cast<CLASS*>(this) ;
- typename Data::group_iterator git = gdf.group_begin() ;
- for( int i=0; i<n; i++, ++git)
- SET_STRING_ELT( res, i, obj->process_chunk(*git) );
- return res ;
- }
-
- SEXP data ;
- } ;
+#include <tools/utils.h>
+
+#include <dplyr/GroupedDataFrame.h>
+#include <dplyr/RowwiseDataFrame.h>
+
+#include <dplyr/Result/Result.h>
+
+namespace dplyr {
+
+// if we derive from this instead of deriving from Result, all we have to
+// do is implement a process_chunk method that takes a SlicingIndex& as
+// input and returns the suitable type (i.e. storage_type<OUTPUT>)
+// all the builtin result implementation (Mean, ...) use this.
+template <int OUTPUT, typename CLASS>
+class Processor : public Result {
+public:
+ typedef typename Rcpp::traits::storage_type<OUTPUT>::type STORAGE;
+
+ Processor() : data(R_NilValue) {}
+
+ Processor(SEXP data_) : data(data_) {}
+
+ virtual SEXP process(const Rcpp::GroupedDataFrame& gdf) {
+ return process_grouped<GroupedDataFrame>(gdf);
+ }
+
+ virtual SEXP process(const Rcpp::RowwiseDataFrame& gdf) {
+ return process_grouped<RowwiseDataFrame>(gdf);
+ }
+
+ virtual SEXP process(const Rcpp::FullDataFrame& df) {
+ return promote(process(df.get_index()));
+ }
+
+ virtual SEXP process(const SlicingIndex& index) {
+ CLASS* obj = static_cast<CLASS*>(this);
+ Rcpp::Vector<OUTPUT> res = Rcpp::Vector<OUTPUT>::create(obj->process_chunk(index));
+ copy_attributes(res, data);
+ return res;
+ }
+
+private:
+
+ template <typename Data>
+ SEXP process_grouped(const Data& gdf) {
+ int n = gdf.ngroups();
+ Rcpp::Shield<SEXP> res(Rf_allocVector(OUTPUT, n));
+ STORAGE* ptr = Rcpp::internal::r_vector_start<OUTPUT>(res);
+ CLASS* obj = static_cast<CLASS*>(this);
+ typename Data::group_iterator git = gdf.group_begin();
+ for (int i = 0; i < n; i++, ++git)
+ ptr[i] = obj->process_chunk(*git);
+ copy_attributes(res, data);
+ return res;
+ }
+
+ inline SEXP promote(SEXP obj) {
+ RObject res(obj);
+ copy_attributes(res, data);
+ return res;
+ }
+
+
+ SEXP data;
+
+
+
+};
+
+template <typename CLASS>
+class Processor<STRSXP, CLASS> : public Result {
+public:
+ Processor(SEXP data_): data(data_) {}
+
+ virtual SEXP process(const Rcpp::GroupedDataFrame& gdf) {
+ return process_grouped<GroupedDataFrame>(gdf);
+ }
+ virtual SEXP process(const Rcpp::RowwiseDataFrame& gdf) {
+ return process_grouped<RowwiseDataFrame>(gdf);
+ }
+
+ virtual SEXP process(const Rcpp::FullDataFrame& df) {
+ return process(df.get_index());
+ }
+
+ virtual SEXP process(const SlicingIndex& index) {
+ CLASS* obj = static_cast<CLASS*>(this);
+ return CharacterVector::create(obj->process_chunk(index));
+ }
+
+private:
+
+ template <typename Data>
+ SEXP process_grouped(const Data& gdf) {
+ int n = gdf.ngroups();
+ Rcpp::Shield<SEXP> res(Rf_allocVector(STRSXP, n));
+ CLASS* obj = static_cast<CLASS*>(this);
+ typename Data::group_iterator git = gdf.group_begin();
+ for (int i = 0; i < n; i++, ++git)
+ SET_STRING_ELT(res, i, obj->process_chunk(*git));
+ return res;
+ }
+
+ SEXP data;
+};
}
#endif
diff --git a/inst/include/dplyr/Result/Rank.h b/inst/include/dplyr/Result/Rank.h
index f441ec6..42631d2 100644
--- a/inst/include/dplyr/Result/Rank.h
+++ b/inst/include/dplyr/Result/Rank.h
@@ -1,419 +1,439 @@
#ifndef dplyr_Result_Rank_H
#define dplyr_Result_Rank_H
-namespace dplyr {
- namespace internal {
-
- struct min_rank_increment{
- typedef IntegerVector OutputVector ;
- typedef int scalar_type ;
-
- template <typename Container>
- inline int post_increment( const Container& x, int) const {
- return x.size() ;
- }
-
- template <typename Container>
- inline int pre_increment( const Container& x, int) const {
- return 0 ;
- }
-
- inline int start() const {
- return 1 ;
- }
-
- } ;
+#include <tools/hash.h>
- struct dense_rank_increment{
- typedef IntegerVector OutputVector ;
- typedef int scalar_type ;
+#include <dplyr/GroupedDataFrame.h>
- template <typename Container>
- inline int post_increment( const Container&, int) const {
- return 1 ;
- }
+#include <dplyr/comparisons.h>
+#include <dplyr/visitor.h>
- template <typename Container>
- inline int pre_increment( const Container&, int) const {
- return 0 ;
- }
+#include <dplyr/Order.h>
- inline int start() const {
- return 1 ;
- }
+#include <dplyr/Result/Result.h>
+#include <dplyr/Result/VectorSliceVisitor.h>
- } ;
-
- struct percent_rank_increment{
- typedef NumericVector OutputVector ;
- typedef double scalar_type ;
+namespace dplyr {
+namespace internal {
- template <typename Container>
- inline double post_increment( const Container& x, int m) const {
- return (double)x.size() / ( m - 1 ) ;
- }
+struct min_rank_increment {
+ typedef IntegerVector OutputVector;
+ typedef int scalar_type;
- template <typename Container>
- inline double pre_increment( const Container& x, int m) const {
- return 0.0 ;
- }
+ template <typename Container>
+ inline int post_increment(const Container& x, int) const {
+ return x.size();
+ }
- inline double start() const {
- return 0.0 ;
- }
+ template <typename Container>
+ inline int pre_increment(const Container&, int) const {
+ return 0;
+ }
+ inline int start() const {
+ return 1;
+ }
- } ;
+};
- struct cume_dist_increment{
- typedef NumericVector OutputVector ;
- typedef double scalar_type ;
+struct dense_rank_increment {
+ typedef IntegerVector OutputVector;
+ typedef int scalar_type;
- template <typename Container>
- inline double post_increment( const Container& x, int m) const {
- return 0.0 ;
- }
+ template <typename Container>
+ inline int post_increment(const Container&, int) const {
+ return 1;
+ }
- template <typename Container>
- inline double pre_increment( const Container& x, int m) const {
- return (double)x.size() / m ;
- }
+ template <typename Container>
+ inline int pre_increment(const Container&, int) const {
+ return 0;
+ }
- inline double start() const {
- return 0.0 ;
- }
- } ;
+ inline int start() const {
+ return 1;
+ }
- }
+};
- template <int RTYPE, bool ascending=true>
- class RankComparer : public comparisons<RTYPE> {
- public:
- typedef typename Rcpp::traits::storage_type<RTYPE>::type STORAGE ;
-
- inline bool operator()(STORAGE lhs, STORAGE rhs) const {
- return comparisons<RTYPE>::is_less(lhs,rhs) ;
- }
- } ;
-
- template <int RTYPE>
- class RankComparer<RTYPE,false> : public comparisons<RTYPE> {
- public:
- typedef typename Rcpp::traits::storage_type<RTYPE>::type STORAGE ;
- inline bool operator()(STORAGE lhs, STORAGE rhs) const{
- return comparisons<RTYPE>::is_greater(lhs,rhs) ;
- }
- } ;
+struct percent_rank_increment {
+ typedef NumericVector OutputVector;
+ typedef double scalar_type;
- template <int RTYPE>
- class RankEqual : public comparisons<RTYPE> {
- public:
- typedef typename Rcpp::traits::storage_type<RTYPE>::type STORAGE ;
+ template <typename Container>
+ inline double post_increment(const Container& x, int m) const {
+ return (double)x.size() / (m - 1);
+ }
- inline bool operator()(STORAGE lhs, STORAGE rhs) const {
- return comparisons<RTYPE>::equal_or_both_na(lhs,rhs) ;
- }
- } ;
-
- // powers both dense_rank and min_rank, see dplyr.cpp for how it is used
- template <int RTYPE, typename Increment, bool ascending = true>
- class Rank_Impl : public Result, public Increment {
- public:
- typedef typename Increment::OutputVector OutputVector ;
- typedef typename Rcpp::traits::storage_type<RTYPE>::type STORAGE ;
-
- typedef VectorSliceVisitor<RTYPE> Slice ;
- typedef RankComparer<RTYPE,ascending> Comparer ;
- typedef RankEqual<RTYPE> Equal ;
-
- typedef dplyr_hash_map<STORAGE, std::vector<int>, boost::hash<STORAGE>, Equal > Map ;
- typedef std::map<STORAGE,const std::vector<int>*, Comparer> oMap ;
-
- Rank_Impl(SEXP data_) : data(data_), map() {}
-
- virtual SEXP process( const GroupedDataFrame& gdf) {
- int ng = gdf.ngroups() ;
- int n = gdf.nrows() ;
- if( n == 0 ) return IntegerVector(0) ;
- GroupedDataFrame::group_iterator git = gdf.group_begin();
- OutputVector out = no_init(n) ;
- for( int i=0; i<ng; i++, ++git){
- process_slice( out, *git ) ;
- }
- return out ;
- }
+ template <typename Container>
+ inline double pre_increment(const Container&, int) const {
+ return 0.0;
+ }
- virtual SEXP process( const RowwiseDataFrame& gdf) {
- return IntegerVector( gdf.nrows(), 1 ) ;
- }
+ inline double start() const {
+ return 0.0;
+ }
- virtual SEXP process( const FullDataFrame& df ) {
- int n = df.nrows() ;
- if( n == 0) return IntegerVector(0) ;
- OutputVector out = no_init(n) ;
- process_slice(out, df.get_index() ) ;
- return out ;
- }
- virtual SEXP process( const SlicingIndex& index ){
- int n = index.size() ;
- if( n == 0 ) return IntegerVector(0) ;
- OutputVector out = no_init(n) ;
- process_slice(out, index) ;
- return out ;
- }
+};
- private:
-
- void process_slice( OutputVector& out, const SlicingIndex& index){
- map.clear() ;
- Slice slice(data, index) ;
- int m=index.size() ;
- for( int j=0; j<m; j++) {
- map[ slice[j] ].push_back(j) ;
- }
- STORAGE na = Rcpp::traits::get_na<RTYPE>() ;
- typename Map::const_iterator it = map.find( na ) ;
- if( it != map.end() ){
- m -= it->second.size() ;
- }
-
- oMap ordered;
-
- it = map.begin() ;
- for( ; it != map.end() ; ++it){
- ordered[it->first] = &it->second ;
- }
- typename oMap::const_iterator oit = ordered.begin() ;
- typename Increment::scalar_type j = Increment::start() ;
- for( ; oit != ordered.end(); ++oit){
- STORAGE key = oit->first ;
- const std::vector<int>& chunk = *oit->second ;
- int n = chunk.size() ;
- j += Increment::pre_increment( chunk, m ) ;
- if( Rcpp::traits::is_na<RTYPE>( key ) ){
- typename Increment::scalar_type na =
- Rcpp::traits::get_na< Rcpp::traits::r_sexptype_traits<typename Increment::scalar_type>::rtype >() ;
- for( int k=0; k<n; k++){
- out[ chunk[k] ] = na ;
- }
- } else {
- for( int k=0; k<n; k++){
- out[ chunk[k] ] = j ;
- }
- }
- j += Increment::post_increment( chunk, m ) ;
- }
- }
+struct cume_dist_increment {
+ typedef NumericVector OutputVector;
+ typedef double scalar_type;
+ template <typename Container>
+ inline double post_increment(const Container&, int) const {
+ return 0.0;
+ }
- SEXP data ;
- Map map ;
- } ;
-
- template <int RTYPE, bool ascending=true>
- class RowNumber : public Result {
- public:
- typedef typename Rcpp::traits::storage_type<RTYPE>::type STORAGE ;
-
- typedef VectorSliceVisitor<RTYPE> Slice ;
- typedef OrderVectorVisitorImpl<RTYPE,ascending,Slice> Visitor ;
- typedef Compare_Single_OrderVisitor<Visitor> Comparer ;
-
- RowNumber(SEXP data_) : data(data_) {}
-
- virtual SEXP process( const GroupedDataFrame& gdf) {
- std::vector<int> tmp( gdf.max_group_size() ) ;
-
- int ng = gdf.ngroups() ;
- int n = gdf.nrows() ;
- if( n == 0 ) return IntegerVector(0) ;
- GroupedDataFrame::group_iterator git = gdf.group_begin();
- IntegerVector out(n) ;
- for( int i=0; i<ng; i++, ++git){
- SlicingIndex index = *git ;
-
- // tmp <- 0:(m-1)
- int m = index.size() ;
- for( int j=0; j<m; j++) tmp[j] = j ;
-
- Slice slice(data, index) ;
- // order( gdf.group(i) )
- std::sort( tmp.begin(), tmp.begin() + m,
- Comparer( Visitor( slice ) )
- ) ;
- int j=m-1;
- for( ; j>=0; j--){
- if( Rcpp::traits::is_na<RTYPE>( slice[ tmp[j] ] ) ){
- m-- ;
- out[ index[j] ] = NA_INTEGER ;
- } else {
- break ;
- }
- }
- for( ; j>=0; j--){
- out[ index[j] ] = tmp[j] + 1 ;
- }
- }
- return out ;
+ template <typename Container>
+ inline double pre_increment(const Container& x, int m) const {
+ return (double)x.size() / m;
+ }
- }
+ inline double start() const {
+ return 0.0;
+ }
+};
- virtual SEXP process( const RowwiseDataFrame& gdf) {
- return IntegerVector( gdf.nrows(), 1 ) ;
- }
+}
- virtual SEXP process( const FullDataFrame& df ) {
- return process( df.get_index() ) ;
- }
- virtual SEXP process( const SlicingIndex& index ){
- int nrows = index.size() ;
- if( nrows == 0 ) return IntegerVector(0) ;
- IntegerVector x = seq(0, nrows -1 ) ;
- Slice slice(data, index) ;
- std::sort( x.begin(), x.end(),
- Comparer( Visitor( slice ) )
- ) ;
- IntegerVector out = no_init(nrows);
- int j=nrows-1 ;
- for( ; j>=0; j--){
- if( Rcpp::traits::is_na<RTYPE>( slice[ x[j] ] ) ){
- out[ x[j] ] = NA_INTEGER ;
- } else {
- break ;
- }
- }
- for( ; j>=0; j--){
- out[ x[j] ] = j + 1 ;
- }
- return out ;
- }
+template <int RTYPE, bool ascending = true>
+class RankComparer {
+ typedef comparisons<RTYPE> compare;
+
+public:
+ typedef typename Rcpp::traits::storage_type<RTYPE>::type STORAGE;
+
+ inline bool operator()(STORAGE lhs, STORAGE rhs) const {
+ return compare::is_less(lhs, rhs);
+ }
+};
+
+template <int RTYPE>
+class RankComparer<RTYPE, false> {
+ typedef comparisons<RTYPE> compare;
+
+public:
+ typedef typename Rcpp::traits::storage_type<RTYPE>::type STORAGE;
+ inline bool operator()(STORAGE lhs, STORAGE rhs) const {
+ return compare::is_greater(lhs, rhs);
+ }
+};
+
+template <int RTYPE>
+class RankEqual {
+ typedef comparisons<RTYPE> compare;
+
+public:
+ typedef typename Rcpp::traits::storage_type<RTYPE>::type STORAGE;
+
+ inline bool operator()(STORAGE lhs, STORAGE rhs) const {
+ return compare::equal_or_both_na(lhs, rhs);
+ }
+};
+
+// powers both dense_rank and min_rank, see dplyr.cpp for how it is used
+template <int RTYPE, typename Increment, bool ascending = true>
+class Rank_Impl : public Result, public Increment {
+public:
+ typedef typename Increment::OutputVector OutputVector;
+ typedef typename Rcpp::traits::storage_type<RTYPE>::type STORAGE;
+
+ typedef VectorSliceVisitor<RTYPE> Slice;
+ typedef RankComparer<RTYPE, ascending> Comparer;
+ typedef RankEqual<RTYPE> Equal;
+
+ typedef dplyr_hash_map<STORAGE, std::vector<int>, boost::hash<STORAGE>, Equal > Map;
+ typedef std::map<STORAGE, const std::vector<int>*, Comparer> oMap;
+
+ Rank_Impl(SEXP data_) : data(data_), map() {}
+
+ virtual SEXP process(const GroupedDataFrame& gdf) {
+ int ng = gdf.ngroups();
+ int n = gdf.nrows();
+ if (n == 0) return IntegerVector(0);
+ GroupedDataFrame::group_iterator git = gdf.group_begin();
+ OutputVector out = no_init(n);
+ for (int i = 0; i < ng; i++, ++git) {
+ process_slice(out, *git);
+ }
+ return out;
+ }
+
+ virtual SEXP process(const RowwiseDataFrame& gdf) {
+ return IntegerVector(gdf.nrows(), 1);
+ }
+
+ virtual SEXP process(const FullDataFrame& df) {
+ int n = df.nrows();
+ if (n == 0) return IntegerVector(0);
+ OutputVector out = no_init(n);
+ process_slice(out, df.get_index());
+ return out;
+ }
+
+ virtual SEXP process(const SlicingIndex& index) {
+ int n = index.size();
+ if (n == 0) return IntegerVector(0);
+ OutputVector out = no_init(n);
+ process_slice(out, index);
+ return out;
+ }
+
+private:
+
+ void process_slice(OutputVector& out, const SlicingIndex& index) {
+ map.clear();
+ Slice slice(&data, index);
+ int m = index.size();
+ for (int j = 0; j < m; j++) {
+ map[ slice[j] ].push_back(j);
+ }
+ STORAGE na = Rcpp::traits::get_na<RTYPE>();
+ typename Map::const_iterator it = map.find(na);
+ if (it != map.end()) {
+ m -= it->second.size();
+ }
- private:
- SEXP data ;
- } ;
-
- template <int RTYPE, bool ascending=true>
- class Ntile : public Result {
- public:
- typedef typename Rcpp::traits::storage_type<RTYPE>::type STORAGE ;
-
- typedef VectorSliceVisitor<RTYPE> Slice ;
- typedef OrderVectorVisitorImpl<RTYPE,ascending,Slice> Visitor ;
- typedef Compare_Single_OrderVisitor<Visitor> Comparer ;
-
- Ntile(SEXP data_, double ntiles_ ) : data(data_), ntiles(ntiles_) {}
-
- virtual SEXP process( const GroupedDataFrame& gdf) {
- std::vector<int> tmp( gdf.max_group_size() ) ;
-
- int ng = gdf.ngroups() ;
- int n = gdf.nrows() ;
- if( n == 0 ) return IntegerVector(0) ;
- GroupedDataFrame::group_iterator git = gdf.group_begin();
- IntegerVector out(n) ;
- for( int i=0; i<ng; i++, ++git){
- SlicingIndex index = *git ;
-
- // tmp <- 0:(m-1)
- int m = index.size() ;
- for( int j=0; j<m; j++) tmp[j] = j ;
- Slice slice(data, index ) ;
-
- // order( gdf.group(i) )
- std::sort( tmp.begin(), tmp.begin() + m,
- Comparer( Visitor( slice ) )
- ) ;
- int j=m-1 ;
- for( ; j>= 0; j-- ){
- if( Rcpp::traits::is_na<RTYPE>(slice[tmp[j]]) ){
- out[index[j]] = NA_INTEGER ;
- m-- ;
- } else {
- break ;
- }
- }
- for( ; j>=0; j--) {
- out[ index[j] ] = (int)floor( (ntiles * tmp[j]) / m ) + 1;
- }
- }
- return out ;
+ oMap ordered;
+ it = map.begin();
+ for (; it != map.end(); ++it) {
+ ordered[it->first] = &it->second;
+ }
+ typename oMap::const_iterator oit = ordered.begin();
+ typename Increment::scalar_type j = Increment::start();
+ for (; oit != ordered.end(); ++oit) {
+ STORAGE key = oit->first;
+ const std::vector<int>& chunk = *oit->second;
+ int n = chunk.size();
+ j += Increment::pre_increment(chunk, m);
+ if (Rcpp::traits::is_na<RTYPE>(key)) {
+ typename Increment::scalar_type inc_na =
+ Rcpp::traits::get_na< Rcpp::traits::r_sexptype_traits<typename Increment::scalar_type>::rtype >();
+ for (int k = 0; k < n; k++) {
+ out[ chunk[k] ] = inc_na;
}
-
- virtual SEXP process( const RowwiseDataFrame& gdf) {
- return IntegerVector( gdf.nrows(), 1 ) ;
+ } else {
+ for (int k = 0; k < n; k++) {
+ out[ chunk[k] ] = j;
}
-
- virtual SEXP process( const FullDataFrame& df ) {
- return process( df.get_index() ) ;
+ }
+ j += Increment::post_increment(chunk, m);
+ }
+ }
+
+
+ Vector<RTYPE> data;
+ Map map;
+};
+
+template <int RTYPE, bool ascending = true>
+class RowNumber : public Result {
+public:
+ typedef typename Rcpp::traits::storage_type<RTYPE>::type STORAGE;
+
+ typedef VectorSliceVisitor<RTYPE> Slice;
+ typedef OrderVectorVisitorImpl<RTYPE, ascending, Slice> Visitor;
+ typedef Compare_Single_OrderVisitor<Visitor> Comparer;
+
+ RowNumber(SEXP data_) : data(data_) {}
+
+ virtual SEXP process(const GroupedDataFrame& gdf) {
+ std::vector<int> tmp(gdf.max_group_size());
+
+ int ng = gdf.ngroups();
+ int n = gdf.nrows();
+ if (n == 0) return IntegerVector(0);
+ GroupedDataFrame::group_iterator git = gdf.group_begin();
+ IntegerVector out(n);
+ for (int i = 0; i < ng; i++, ++git) {
+ const SlicingIndex& index = *git;
+
+ // tmp <- 0:(m-1)
+ int m = index.size();
+ for (int j = 0; j < m; j++) tmp[j] = j;
+
+ Slice slice(&data, index);
+ // order( gdf.group(i) )
+ Visitor visitor(slice);
+ Comparer comparer(visitor);
+ std::sort(tmp.begin(), tmp.begin() + m, comparer);
+ int j = m - 1;
+ for (; j >= 0; j--) {
+ if (Rcpp::traits::is_na<RTYPE>(slice[ tmp[j] ])) {
+ m--;
+ out[ index[j] ] = NA_INTEGER;
+ } else {
+ break;
}
-
- virtual SEXP process( const SlicingIndex& index ){
- int nrows = index.size() ;
- if( nrows == 0 ) return IntegerVector(0) ;
- IntegerVector x = seq(0, nrows -1 ) ;
- Slice slice(data, index) ;
- Visitor visitor( slice ) ;
- std::sort( x.begin(), x.end(), Comparer( visitor ) ) ;
- IntegerVector out = no_init(nrows);
- int i=nrows-1 ;
- for( ; i>=0; i--){
- if( Rcpp::traits::is_na<RTYPE>(slice[x[i]] ) ) {
- nrows-- ;
- out[x[i]] = NA_INTEGER ;
- } else {
- break ;
- }
- }
-
- for( ; i>=0; i--){
- out[ x[i] ] = (int)floor(ntiles * i / nrows ) + 1;
- }
- return out ;
+ }
+ for (; j >= 0; j--) {
+ out[ index[j] ] = tmp[j] + 1;
+ }
+ }
+ return out;
+
+ }
+
+ virtual SEXP process(const RowwiseDataFrame& gdf) {
+ return IntegerVector(gdf.nrows(), 1);
+ }
+
+ virtual SEXP process(const FullDataFrame& df) {
+ return process(df.get_index());
+ }
+
+ virtual SEXP process(const SlicingIndex& index) {
+ int nrows = index.size();
+ if (nrows == 0) return IntegerVector(0);
+ IntegerVector x = seq(0, nrows - 1);
+ Slice slice(&data, index);
+ Visitor visitor(slice);
+ std::sort(x.begin(), x.end(), Comparer(visitor));
+ IntegerVector out = no_init(nrows);
+ int j = nrows - 1;
+ for (; j >= 0; j--) {
+ if (Rcpp::traits::is_na<RTYPE>(slice[ x[j] ])) {
+ out[ x[j] ] = NA_INTEGER;
+ } else {
+ break;
+ }
+ }
+ for (; j >= 0; j--) {
+ out[ x[j] ] = j + 1;
+ }
+ return out;
+ }
+
+private:
+ Vector<RTYPE> data;
+};
+
+template <int RTYPE, bool ascending = true>
+class Ntile : public Result {
+public:
+ typedef typename Rcpp::traits::storage_type<RTYPE>::type STORAGE;
+
+ typedef VectorSliceVisitor<RTYPE> Slice;
+ typedef OrderVectorVisitorImpl<RTYPE, ascending, Slice> Visitor;
+ typedef Compare_Single_OrderVisitor<Visitor> Comparer;
+
+ Ntile(SEXP data_, double ntiles_) : data(data_), ntiles(ntiles_) {}
+
+ virtual SEXP process(const GroupedDataFrame& gdf) {
+ std::vector<int> tmp(gdf.max_group_size());
+
+ int ng = gdf.ngroups();
+ int n = gdf.nrows();
+ if (n == 0) return IntegerVector(0);
+ GroupedDataFrame::group_iterator git = gdf.group_begin();
+ IntegerVector out(n);
+ for (int i = 0; i < ng; i++, ++git) {
+ const SlicingIndex& index = *git;
+
+ // tmp <- 0:(m-1)
+ int m = index.size();
+ for (int j = 0; j < m; j++) tmp[j] = j;
+ Slice slice(&data, index);
+
+ // order( gdf.group(i) )
+ Visitor visitor(slice);
+ Comparer comparer(visitor);
+ std::sort(tmp.begin(), tmp.begin() + m, comparer);
+ int j = m - 1;
+ for (; j >= 0; j--) {
+ if (Rcpp::traits::is_na<RTYPE>(slice[tmp[j]])) {
+ out[index[j]] = NA_INTEGER;
+ m--;
+ } else {
+ break;
}
+ }
+ for (; j >= 0; j--) {
+ out[ index[j] ] = (int)floor((ntiles * tmp[j]) / m) + 1;
+ }
+ }
+ return out;
+
+ }
+
+ virtual SEXP process(const RowwiseDataFrame& gdf) {
+ return IntegerVector(gdf.nrows(), 1);
+ }
+
+ virtual SEXP process(const FullDataFrame& df) {
+ return process(df.get_index());
+ }
+
+ virtual SEXP process(const SlicingIndex& index) {
+ int nrows = index.size();
+ if (nrows == 0) return IntegerVector(0);
+ IntegerVector x = seq(0, nrows - 1);
+ Slice slice(&data, index);
+ Visitor visitor(slice);
+ std::sort(x.begin(), x.end(), Comparer(visitor));
+ IntegerVector out = no_init(nrows);
+ int i = nrows - 1;
+ for (; i >= 0; i--) {
+ if (Rcpp::traits::is_na<RTYPE>(slice[x[i]])) {
+ nrows--;
+ out[x[i]] = NA_INTEGER;
+ } else {
+ break;
+ }
+ }
- private:
- SEXP data ;
- double ntiles ;
- } ;
-
- class RowNumber_0 : public Result {
- public:
-
- virtual SEXP process( const GroupedDataFrame& gdf ){
- int n = gdf.nrows(), ng = gdf.ngroups() ;
- if( n == 0 ) return IntegerVector(0) ;
-
- IntegerVector res = no_init(n) ;
- GroupedDataFrame::group_iterator git = gdf.group_begin() ;
- for( int i=0; i<ng; i++, ++git){
- SlicingIndex index = *git ;
- int m = index.size() ;
- for( int j=0; j<m; j++) res[index[j]] = j + 1 ;
- }
- return res ;
- }
+ for (; i >= 0; i--) {
+ out[ x[i] ] = (int)floor(ntiles * i / nrows) + 1;
+ }
+ return out;
+ }
+
+private:
+ Vector<RTYPE> data;
+ double ntiles;
+};
+
+class RowNumber_0 : public Result {
+public:
+
+ virtual SEXP process(const GroupedDataFrame& gdf) {
+ int n = gdf.nrows(), ng = gdf.ngroups();
+ if (n == 0) return IntegerVector(0);
+
+ IntegerVector res = no_init(n);
+ GroupedDataFrame::group_iterator git = gdf.group_begin();
+ for (int i = 0; i < ng; i++, ++git) {
+ const SlicingIndex& index = *git;
+ int m = index.size();
+ for (int j = 0; j < m; j++) res[index[j]] = j + 1;
+ }
+ return res;
+ }
- virtual SEXP process( const RowwiseDataFrame& gdf) {
- return IntegerVector( gdf.nrows(), 1 ) ;
- }
+ virtual SEXP process(const RowwiseDataFrame& gdf) {
+ return IntegerVector(gdf.nrows(), 1);
+ }
- virtual SEXP process( const FullDataFrame& df ) {
- if( df.nrows() == 0 ) return IntegerVector(0) ;
- IntegerVector res = seq(1, df.nrows() ) ;
- return res ;
- }
+ virtual SEXP process(const FullDataFrame& df) {
+ if (df.nrows() == 0) return IntegerVector(0);
+ IntegerVector res = seq(1, df.nrows());
+ return res;
+ }
- virtual SEXP process( const SlicingIndex& index ){
- if( index.size() == 0 ) return IntegerVector(0) ;
- IntegerVector res = seq(1, index.size() ) ;
- return res ;
- }
+ virtual SEXP process(const SlicingIndex& index) {
+ if (index.size() == 0) return IntegerVector(0);
+ IntegerVector res = seq(1, index.size());
+ return res;
+ }
- } ;
+};
}
+#include <dplyr/visitor_impl.h>
+
#endif
diff --git a/inst/include/dplyr/Result/Result.h b/inst/include/dplyr/Result/Result.h
index dbc122f..8bd28a4 100644
--- a/inst/include/dplyr/Result/Result.h
+++ b/inst/include/dplyr/Result/Result.h
@@ -1,25 +1,30 @@
#ifndef dplyr_Result_H
#define dplyr_Result_H
+#include <dplyr/RowwiseDataFrame.h>
+#include <dplyr/GroupedDataFrame.h>
+#include <dplyr/FullDataFrame.h>
+#include <tools/SlicingIndex.h>
+
namespace dplyr {
- class Result {
- public:
- Result(){}
+class Result {
+public:
+ Result() {}
- virtual ~Result(){} ;
+ virtual ~Result() {};
- virtual SEXP process( const RowwiseDataFrame& gdf) = 0 ;
+ virtual SEXP process(const RowwiseDataFrame& gdf) = 0;
- virtual SEXP process( const GroupedDataFrame& gdf) = 0 ;
+ virtual SEXP process(const GroupedDataFrame& gdf) = 0;
- virtual SEXP process( const FullDataFrame& df ) = 0 ;
+ virtual SEXP process(const FullDataFrame& df) = 0;
- virtual SEXP process( const SlicingIndex& index ){
- return R_NilValue ;
- }
+ virtual SEXP process(const SlicingIndex&) {
+ return R_NilValue;
+ }
- } ;
+};
} // namespace dplyr
diff --git a/inst/include/dplyr/Result/ResultSet.h b/inst/include/dplyr/Result/ResultSet.h
deleted file mode 100644
index 7bdf5a2..0000000
--- a/inst/include/dplyr/Result/ResultSet.h
+++ /dev/null
@@ -1,28 +0,0 @@
-#ifndef dplyr_Result_ResultSet_H
-#define dplyr_Result_ResultSet_H
-
-namespace dplyr {
-
- class ResultSet {
- public:
- ResultSet( ) : results(), names(), n(0) {}
-
- void add_result( const std::string& name, Result* result ){
- results.push_back( result ) ;
- names.push_back( name ) ;
- n++ ;
- }
-
- Result* get(int k){ return results[k] ; }
- inline int size() const { return n ; }
- Rcpp::String name(int k) const { return names[k] ; }
-
- private:
- pointer_vector<Result> results ;
- std::vector<std::string> names ;
- int n ;
- } ;
-
-}
-
-#endif
diff --git a/inst/include/dplyr/Result/RowwiseSubset.h b/inst/include/dplyr/Result/RowwiseSubset.h
index 2a8a119..4da3fcd 100644
--- a/inst/include/dplyr/Result/RowwiseSubset.h
+++ b/inst/include/dplyr/Result/RowwiseSubset.h
@@ -1,85 +1,89 @@
#ifndef dplyr_RowwiseSubset_H
#define dplyr_RowwiseSubset_H
+#include <tools/ShrinkableVector.h>
+#include <tools/utils.h>
+
+#include <dplyr/checks.h>
+
+#include <dplyr/Result/GroupedSubsetBase.h>
+
namespace dplyr {
- class RowwiseSubset {
- public:
- RowwiseSubset(){} ;
- virtual ~RowwiseSubset(){} ;
- virtual SEXP get( const SlicingIndex& indices ) = 0 ;
- virtual SEXP get_variable() const = 0 ;
- virtual bool is_summary() const = 0;
- } ;
-
- template <int RTYPE>
- class RowwiseSubsetTemplate : public RowwiseSubset {
- public:
- typedef typename Rcpp::traits::storage_type<RTYPE>::type STORAGE ;
- RowwiseSubsetTemplate( SEXP x ) :
- object(x), output(1), start( Rcpp::internal::r_vector_start<RTYPE>(object) )
- {
- copy_most_attributes( output, x) ;
- SET_DPLYR_SHRINKABLE_VECTOR( (SEXP)output) ;
- }
-
- ~RowwiseSubsetTemplate(){
- UNSET_DPLYR_SHRINKABLE_VECTOR( (SEXP)output) ;
- }
-
- virtual SEXP get( const SlicingIndex& indices ) {
- output[0] = start[ indices.group() ] ;
- return output ;
- }
- virtual SEXP get_variable() const {
- return object ;
- }
- virtual bool is_summary() const {
- return false;
- }
-
- private:
- SEXP object ;
- Vector<RTYPE> output ;
- STORAGE* start ;
- } ;
-
- template <>
- class RowwiseSubsetTemplate<VECSXP> : public RowwiseSubset {
- public:
- RowwiseSubsetTemplate( SEXP x) :
- object(x), start( Rcpp::internal::r_vector_start<VECSXP>(object) )
- {}
-
- virtual SEXP get( const SlicingIndex& indices ) {
- return start[ indices.group() ] ;
- }
- virtual SEXP get_variable() const {
- return object ;
- }
- virtual bool is_summary() const {
- return false;
- }
-
- private:
- SEXP object ;
- SEXP* start ;
- } ;
-
-
- inline RowwiseSubset* rowwise_subset(SEXP x){
- switch( check_supported_type(x) ){
- case DPLYR_INTSXP: return new RowwiseSubsetTemplate<INTSXP>(x) ;
- case DPLYR_REALSXP: return new RowwiseSubsetTemplate<REALSXP>(x) ;
- case DPLYR_LGLSXP: return new RowwiseSubsetTemplate<LGLSXP>(x) ;
- case DPLYR_STRSXP: return new RowwiseSubsetTemplate<STRSXP>(x) ;
- case DPLYR_CPLXSXP: return new RowwiseSubsetTemplate<CPLXSXP>(x) ;
- case DPLYR_VECSXP: return new RowwiseSubsetTemplate<VECSXP>(x) ;
- }
-
- stop("Unreachable") ;
- return 0 ;
- }
+template <int RTYPE>
+class RowwiseSubsetTemplate : public RowwiseSubset {
+public:
+ typedef typename Rcpp::traits::storage_type<RTYPE>::type STORAGE;
+ RowwiseSubsetTemplate(SEXP x) :
+ object(x), output(1), start(Rcpp::internal::r_vector_start<RTYPE>(object))
+ {
+ copy_most_attributes(output, x);
+ SET_DPLYR_SHRINKABLE_VECTOR((SEXP)output);
+ }
+
+ ~RowwiseSubsetTemplate() {
+ UNSET_DPLYR_SHRINKABLE_VECTOR((SEXP)output);
+ }
+
+ virtual SEXP get(const SlicingIndex& indices) {
+ output[0] = start[ indices.group() ];
+ return output;
+ }
+ virtual SEXP get_variable() const {
+ return object;
+ }
+ virtual bool is_summary() const {
+ return false;
+ }
+
+private:
+ SEXP object;
+ Vector<RTYPE> output;
+ STORAGE* start;
+};
+
+template <>
+class RowwiseSubsetTemplate<VECSXP> : public RowwiseSubset {
+public:
+ RowwiseSubsetTemplate(SEXP x) :
+ object(x), start(Rcpp::internal::r_vector_start<VECSXP>(object))
+ {}
+
+ virtual SEXP get(const SlicingIndex& indices) {
+ return start[ indices.group() ];
+ }
+ virtual SEXP get_variable() const {
+ return object;
+ }
+ virtual bool is_summary() const {
+ return false;
+ }
+
+private:
+ SEXP object;
+ SEXP* start;
+};
+
+
+inline RowwiseSubset* rowwise_subset(SEXP x) {
+ switch (check_supported_type(x)) {
+ case DPLYR_INTSXP:
+ return new RowwiseSubsetTemplate<INTSXP>(x);
+ case DPLYR_REALSXP:
+ return new RowwiseSubsetTemplate<REALSXP>(x);
+ case DPLYR_LGLSXP:
+ return new RowwiseSubsetTemplate<LGLSXP>(x);
+ case DPLYR_STRSXP:
+ return new RowwiseSubsetTemplate<STRSXP>(x);
+ case DPLYR_CPLXSXP:
+ return new RowwiseSubsetTemplate<CPLXSXP>(x);
+ case DPLYR_VECSXP:
+ return new RowwiseSubsetTemplate<VECSXP>(x);
+ }
+
+ stop("Unreachable");
+ return 0;
+}
}
diff --git a/inst/include/dplyr/Result/Sd.h b/inst/include/dplyr/Result/Sd.h
index 7def7ef..186b2f5 100644
--- a/inst/include/dplyr/Result/Sd.h
+++ b/inst/include/dplyr/Result/Sd.h
@@ -1,26 +1,28 @@
#ifndef dplyr_Result_Sd_H
#define dplyr_Result_Sd_H
+#include <dplyr/Result/Processor.h>
+
namespace dplyr {
- template <int RTYPE, bool NA_RM>
- class Sd : public Processor<REALSXP, Sd<RTYPE,NA_RM> > {
- public:
- typedef Processor<REALSXP, Sd<RTYPE,NA_RM> > Base ;
+template <int RTYPE, bool NA_RM>
+class Sd : public Processor<REALSXP, Sd<RTYPE, NA_RM> > {
+public:
+ typedef Processor<REALSXP, Sd<RTYPE, NA_RM> > Base;
- Sd(SEXP x, bool is_summary = false) :
- Base(x),
- var(x, is_summary)
- {}
- ~Sd(){}
+ Sd(SEXP x, bool is_summary = false) :
+ Base(x),
+ var(x, is_summary)
+ {}
+ ~Sd() {}
- inline double process_chunk( const SlicingIndex& indices ){
- return sqrt( var.process_chunk( indices ) );
- }
+ inline double process_chunk(const SlicingIndex& indices) {
+ return sqrt(var.process_chunk(indices));
+ }
- private:
- Var<RTYPE,NA_RM> var ;
- } ;
+private:
+ Var<RTYPE, NA_RM> var;
+};
}
diff --git a/inst/include/dplyr/Result/Sum.h b/inst/include/dplyr/Result/Sum.h
index 2460541..16ca822 100644
--- a/inst/include/dplyr/Result/Sum.h
+++ b/inst/include/dplyr/Result/Sum.h
@@ -1,102 +1,104 @@
#ifndef dplyr_Result_Sum_H
#define dplyr_Result_Sum_H
+#include <dplyr/Result/Processor.h>
+
namespace dplyr {
namespace internal {
- // this one is actually only used for RTYPE = REALSXP and NA_RM = true
- template <int RTYPE, bool NA_RM, typename Index>
- struct Sum {
- typedef typename Rcpp::traits::storage_type<RTYPE>::type STORAGE ;
- static STORAGE process(typename Rcpp::traits::storage_type<RTYPE>::type* ptr, const Index& indices) {
- long double res = 0 ;
- int n = indices.size() ;
- for( int i=0; i<n; i++){
- double value = ptr[indices[i]] ;
- if( ! Rcpp::traits::is_na<RTYPE>( value ) ) res += value ;
- }
- return (double)res ;
- }
- } ;
+// this one is actually only used for RTYPE = REALSXP and NA_RM = true
+template <int RTYPE, bool NA_RM, typename Index>
+struct Sum {
+ typedef typename Rcpp::traits::storage_type<RTYPE>::type STORAGE;
+ static STORAGE process(typename Rcpp::traits::storage_type<RTYPE>::type* ptr, const Index& indices) {
+ long double res = 0;
+ int n = indices.size();
+ for (int i = 0; i < n; i++) {
+ double value = ptr[indices[i]];
+ if (! Rcpp::traits::is_na<RTYPE>(value)) res += value;
+ }
+ return (double)res;
+ }
+};
- template <typename Index>
- struct Sum<INTSXP,true, Index> {
- static int process( int* ptr, const Index& indices){
- long double res = 0 ;
- int n = indices.size() ;
- for( int i=0; i<n; i++){
- int value = ptr[indices[i]] ;
- if( ! Rcpp::traits::is_na<INTSXP>( value ) ) res += value ;
- }
- if(res > INT_MAX || res <= INT_MIN){
- warning( "integer overflow - use sum(as.numeric(.))" ) ;
- return IntegerVector::get_na() ;
- }
- return (int)res ;
- }
- };
+template <typename Index>
+struct Sum<INTSXP, true, Index> {
+ static int process(int* ptr, const Index& indices) {
+ long double res = 0;
+ int n = indices.size();
+ for (int i = 0; i < n; i++) {
+ int value = ptr[indices[i]];
+ if (! Rcpp::traits::is_na<INTSXP>(value)) res += value;
+ }
+ if (res > INT_MAX || res <= INT_MIN) {
+ warning("integer overflow - use sum(as.numeric(.))");
+ return IntegerVector::get_na();
+ }
+ return (int)res;
+ }
+};
- template <typename Index>
- struct Sum<INTSXP, false, Index>{
- static int process( int* ptr, const Index& indices ){
- long double res = 0 ;
- int n = indices.size() ;
- for( int i=0; i<n; i++){
- int value = ptr[indices[i]] ;
- if( Rcpp::traits::is_na<INTSXP>( value ) ){
- return NA_INTEGER ;
- }
- res += value ;
- }
- if(res > INT_MAX || res <= INT_MIN){
- warning( "integer overflow - use sum(as.numeric(.))" ) ;
- return IntegerVector::get_na() ;
- }
- return (int)res ;
- }
- } ;
+template <typename Index>
+struct Sum<INTSXP, false, Index> {
+ static int process(int* ptr, const Index& indices) {
+ long double res = 0;
+ int n = indices.size();
+ for (int i = 0; i < n; i++) {
+ int value = ptr[indices[i]];
+ if (Rcpp::traits::is_na<INTSXP>(value)) {
+ return NA_INTEGER;
+ }
+ res += value;
+ }
+ if (res > INT_MAX || res <= INT_MIN) {
+ warning("integer overflow - use sum(as.numeric(.))");
+ return IntegerVector::get_na();
+ }
+ return (int)res;
+ }
+};
- template <typename Index>
- struct Sum<REALSXP, false, Index> {
- static double process( double* ptr, const Index& indices ){
- long double res = 0.0 ;
- int n = indices.size() ;
- for( int i=0; i<n; i++){
- // we don't test for NA here because += NA will give NA
- // this is faster in the most common case where there are no NA
- // if there are NA, we could return quicker as in the version for
- // INTSXP above, but we would penalize the most common case
- res += ptr[ indices[i] ] ;
- }
- return (double)res ;
- }
- } ;
+template <typename Index>
+struct Sum<REALSXP, false, Index> {
+ static double process(double* ptr, const Index& indices) {
+ long double res = 0.0;
+ int n = indices.size();
+ for (int i = 0; i < n; i++) {
+ // we don't test for NA here because += NA will give NA
+ // this is faster in the most common case where there are no NA
+ // if there are NA, we could return quicker as in the version for
+ // INTSXP above, but we would penalize the most common case
+ res += ptr[ indices[i] ];
+ }
+ return (double)res;
+ }
+};
} // namespace internal
- template <int RTYPE, bool NA_RM>
- class Sum : public Processor< RTYPE, Sum<RTYPE,NA_RM> > {
- public:
- typedef Processor< RTYPE, Sum<RTYPE,NA_RM> > Base ;
- typedef typename Rcpp::traits::storage_type<RTYPE>::type STORAGE ;
+template <int RTYPE, bool NA_RM>
+class Sum : public Processor< RTYPE, Sum<RTYPE, NA_RM> > {
+public:
+ typedef Processor< RTYPE, Sum<RTYPE, NA_RM> > Base;
+ typedef typename Rcpp::traits::storage_type<RTYPE>::type STORAGE;
- Sum(SEXP x, bool is_summary_ = false) :
- Base(x),
- data_ptr( Rcpp::internal::r_vector_start<RTYPE>(x) ),
- is_summary(is_summary_)
- {}
- ~Sum(){}
+ Sum(SEXP x, bool is_summary_ = false) :
+ Base(x),
+ data_ptr(Rcpp::internal::r_vector_start<RTYPE>(x)),
+ is_summary(is_summary_)
+ {}
+ ~Sum() {}
- inline STORAGE process_chunk( const SlicingIndex& indices ){
- if( is_summary ) return data_ptr[indices.group()] ;
- return internal::Sum<RTYPE,NA_RM,SlicingIndex>::process(data_ptr, indices) ;
- }
+ inline STORAGE process_chunk(const SlicingIndex& indices) {
+ if (is_summary) return data_ptr[indices.group()];
+ return internal::Sum<RTYPE, NA_RM, SlicingIndex>::process(data_ptr, indices);
+ }
- STORAGE* data_ptr ;
- bool is_summary ;
- } ;
+ STORAGE* data_ptr;
+ bool is_summary;
+};
}
diff --git a/inst/include/dplyr/Result/Var.h b/inst/include/dplyr/Result/Var.h
index 2faeb59..390c405 100644
--- a/inst/include/dplyr/Result/Var.h
+++ b/inst/include/dplyr/Result/Var.h
@@ -1,86 +1,90 @@
#ifndef dplyr_Result_Var_H
#define dplyr_Result_Var_H
+#include <dplyr/Result/Processor.h>
+
namespace dplyr {
-namespace internal{
- inline double square(double x){ return x*x ; }
+namespace internal {
+inline double square(double x) {
+ return x * x;
+}
}
- // version for NA_RM = false
- template <int RTYPE, bool NA_RM>
- class Var : public Processor<REALSXP, Var<RTYPE,NA_RM> > {
- public:
- typedef Processor<REALSXP, Var<RTYPE,NA_RM> > Base ;
- typedef typename Rcpp::traits::storage_type<RTYPE>::type STORAGE ;
-
- Var(SEXP x, bool is_summary_ = false) :
- Base(x),
- data_ptr( Rcpp::internal::r_vector_start<RTYPE>(x) ),
- is_summary(is_summary_)
- {}
- ~Var(){}
-
- inline double process_chunk( const SlicingIndex& indices ){
- if( is_summary ) return NA_REAL ;
-
- int n=indices.size() ;
- if( n == 1 ) return NA_REAL ;
- double m = internal::Mean_internal<RTYPE,NA_RM, SlicingIndex>::process( data_ptr, indices );
-
- if( !R_FINITE(m) ) return m ;
-
- double sum = 0.0 ;
- for( int i=0; i<n; i++){
- sum += internal::square( data_ptr[indices[i]] - m ) ;
- }
- return sum / ( n - 1 );
- }
-
- private:
- STORAGE* data_ptr ;
- bool is_summary ;
- } ;
-
-
- // version for NA_RM = true
- template <int RTYPE>
- class Var<RTYPE,true> : public Processor<REALSXP, Var<RTYPE,true> > {
- public:
- typedef Processor<REALSXP, Var<RTYPE,true> > Base ;
- typedef typename Rcpp::traits::storage_type<RTYPE>::type STORAGE ;
-
- Var(SEXP x, bool is_summary_ = false) :
- Base(x),
- data_ptr( Rcpp::internal::r_vector_start<RTYPE>(x) ),
- is_summary(is_summary_)
- {}
- ~Var(){}
-
- inline double process_chunk( const SlicingIndex& indices ){
- if( is_summary ) return NA_REAL ;
-
- int n=indices.size() ;
- if( n == 1 ) return NA_REAL ;
- double m = internal::Mean_internal<RTYPE,true,SlicingIndex>::process( data_ptr, indices );
-
- if( !R_FINITE(m) ) return m ;
-
- double sum = 0.0 ;
- int count = 0 ;
- for( int i=0; i<n; i++){
- STORAGE current = data_ptr[indices[i]] ;
- if( Rcpp::Vector<RTYPE>::is_na(current) ) continue ;
- sum += internal::square( current - m ) ;
- count++ ;
- }
- if( count == 1 ) return NA_REAL ;
- return sum / ( count - 1 );
- }
-
- private:
- STORAGE* data_ptr ;
- bool is_summary ;
- } ;
+// version for NA_RM = false
+template <int RTYPE, bool NA_RM>
+class Var : public Processor<REALSXP, Var<RTYPE, NA_RM> > {
+public:
+ typedef Processor<REALSXP, Var<RTYPE, NA_RM> > Base;
+ typedef typename Rcpp::traits::storage_type<RTYPE>::type STORAGE;
+
+ Var(SEXP x, bool is_summary_ = false) :
+ Base(x),
+ data_ptr(Rcpp::internal::r_vector_start<RTYPE>(x)),
+ is_summary(is_summary_)
+ {}
+ ~Var() {}
+
+ inline double process_chunk(const SlicingIndex& indices) {
+ if (is_summary) return NA_REAL;
+
+ int n = indices.size();
+ if (n == 1) return NA_REAL;
+ double m = internal::Mean_internal<RTYPE, NA_RM, SlicingIndex>::process(data_ptr, indices);
+
+ if (!R_FINITE(m)) return m;
+
+ double sum = 0.0;
+ for (int i = 0; i < n; i++) {
+ sum += internal::square(data_ptr[indices[i]] - m);
+ }
+ return sum / (n - 1);
+ }
+
+private:
+ STORAGE* data_ptr;
+ bool is_summary;
+};
+
+
+// version for NA_RM = true
+template <int RTYPE>
+class Var<RTYPE, true> : public Processor<REALSXP, Var<RTYPE, true> > {
+public:
+ typedef Processor<REALSXP, Var<RTYPE, true> > Base;
+ typedef typename Rcpp::traits::storage_type<RTYPE>::type STORAGE;
+
+ Var(SEXP x, bool is_summary_ = false) :
+ Base(x),
+ data_ptr(Rcpp::internal::r_vector_start<RTYPE>(x)),
+ is_summary(is_summary_)
+ {}
+ ~Var() {}
+
+ inline double process_chunk(const SlicingIndex& indices) {
+ if (is_summary) return NA_REAL;
+
+ int n = indices.size();
+ if (n == 1) return NA_REAL;
+ double m = internal::Mean_internal<RTYPE, true, SlicingIndex>::process(data_ptr, indices);
+
+ if (!R_FINITE(m)) return m;
+
+ double sum = 0.0;
+ int count = 0;
+ for (int i = 0; i < n; i++) {
+ STORAGE current = data_ptr[indices[i]];
+ if (Rcpp::Vector<RTYPE>::is_na(current)) continue;
+ sum += internal::square(current - m);
+ count++;
+ }
+ if (count == 1) return NA_REAL;
+ return sum / (count - 1);
+ }
+
+private:
+ STORAGE* data_ptr;
+ bool is_summary;
+};
diff --git a/inst/include/dplyr/Result/VectorSliceVisitor.h b/inst/include/dplyr/Result/VectorSliceVisitor.h
index 32f7823..3d74c3a 100644
--- a/inst/include/dplyr/Result/VectorSliceVisitor.h
+++ b/inst/include/dplyr/Result/VectorSliceVisitor.h
@@ -1,38 +1,38 @@
#ifndef dplyr_Result_VectorSliceVisitor_H
#define dplyr_Result_VectorSliceVisitor_H
+#include <tools/wrap_subset.h>
+
namespace dplyr {
- template <int RTYPE>
- class VectorSliceVisitor {
- public:
- typedef typename Rcpp::traits::storage_type<RTYPE>::type STORAGE ;
-
- VectorSliceVisitor( SEXP data_, const SlicingIndex& index_ ) :
- data(data_),
- ptr( Rcpp::internal::r_vector_start<RTYPE>(data)),
- n(index_.size()),
- index(index_)
- {}
-
- inline STORAGE operator[]( int i) const {
- return ptr[index[i]];
- }
-
- inline int size() const {
- return n ;
- }
-
- inline operator SEXP() const {
- return wrap_subset<RTYPE>(data, index) ;
- }
-
- private:
- SEXP data ;
- STORAGE* ptr ;
- int n ;
- const SlicingIndex& index;
- } ;
+template <int RTYPE>
+class VectorSliceVisitor {
+public:
+ typedef typename Rcpp::traits::storage_type<RTYPE>::type STORAGE;
+
+ VectorSliceVisitor(const Vector<RTYPE>* data_, const SlicingIndex& index_) :
+ data(*data_),
+ n(index_.size()),
+ index(index_)
+ {}
+
+ inline STORAGE operator[](int i) const {
+ return data[index[i]];
+ }
+
+ inline int size() const {
+ return n;
+ }
+
+ inline operator SEXP() const {
+ return wrap_subset<RTYPE>(data, index);
+ }
+
+private:
+ const Vector<RTYPE>& data;
+ int n;
+ const SlicingIndex& index;
+};
}
diff --git a/inst/include/dplyr/Result/all.h b/inst/include/dplyr/Result/all.h
index cb6ea94..acc5be4 100644
--- a/inst/include/dplyr/Result/all.h
+++ b/inst/include/dplyr/Result/all.h
@@ -1,7 +1,6 @@
#ifndef dplyr_Result_all_H
#define dplyr_Result_all_H
-#include <dplyr/Result/Everything.h>
#include <dplyr/Result/is_smaller.h>
#include <dplyr/Result/GroupedSubset.h>
#include <dplyr/Result/RowwiseSubset.h>
@@ -13,29 +12,22 @@
#include <dplyr/Result/Sum.h>
#include <dplyr/Result/Var.h>
#include <dplyr/Result/Sd.h>
-#include <dplyr/Result/min.h>
-#include <dplyr/Result/max.h>
-#include <dplyr/Result/CallElementProxy.h>
+#include <dplyr/Result/MinMax.h>
+#include <dplyr/Result/CallElementProxy.h>
#include <dplyr/Result/DelayedProcessor.h>
#include <dplyr/Result/CallbackProcessor.h>
-#include <dplyr/Result/DelayedReducer.h>
+#include <dplyr/Result/ILazySubsets.h>
#include <dplyr/Result/LazySubsets.h>
#include <dplyr/Result/LazyGroupedSubsets.h>
#include <dplyr/Result/LazyRowwiseSubsets.h>
-#include <dplyr/Result/GroupedHybridCall.h>
-#include <dplyr/Result/GroupedCallProxy.h>
#include <dplyr/Result/GroupedCallReducer.h>
#include <dplyr/Result/CallProxy.h>
-#include <dplyr/Result/ResultSet.h>
-#include <dplyr/Result/factories.h>
-
#include <dplyr/Result/VectorSliceVisitor.h>
#include <dplyr/Result/Rank.h>
#include <dplyr/Result/ConstantResult.h>
-#include <dplyr/Result/Mutater.h>
#include <dplyr/Result/Lead.h>
#include <dplyr/Result/Lag.h>
#include <dplyr/Result/CumSum.h>
diff --git a/inst/include/dplyr/Result/factories.h b/inst/include/dplyr/Result/factories.h
deleted file mode 100644
index 8099f24..0000000
--- a/inst/include/dplyr/Result/factories.h
+++ /dev/null
@@ -1,43 +0,0 @@
-#ifndef dplyr_Result_factories_H
-#define dplyr_Result_factories_H
-
-namespace dplyr {
-
- inline Count* count(){ return new Count ; }
-
- class Reducer_Proxy{
- public:
- Reducer_Proxy( Rcpp::Function fun_, Rcpp::String variable_ ):
- fun(fun_), variable(variable_)
- {}
- Rcpp::Function fun ;
- Rcpp::String variable ;
- } ;
-
- inline Reducer_Proxy reduce( Rcpp::Function fun, Rcpp::String variable){
- return Reducer_Proxy( fun, variable ) ;
- }
-
- #define MAKE_PROXY(PREFIX,_prefix_) \
- class PREFIX##_Proxy { \
- public: \
- PREFIX##_Proxy( Rcpp::String variable_, bool na_rm_ ) : \
- variable(variable_), na_rm(na_rm_){} \
- Rcpp::String variable ; \
- bool na_rm ; \
- } ; \
- inline PREFIX##_Proxy _prefix_( Rcpp::String variable, bool na_rm = false ){\
- return PREFIX##_Proxy( variable, na_rm ) ; \
- }
-
- MAKE_PROXY(Mean,mean)
- MAKE_PROXY(Sum,sum)
- MAKE_PROXY(Min,min)
- MAKE_PROXY(Max,max)
- MAKE_PROXY(Var,var)
- MAKE_PROXY(Sd,sd)
-
- #undef MAKE_PROXY
-}
-
-#endif
diff --git a/inst/include/dplyr/Result/is_smaller.h b/inst/include/dplyr/Result/is_smaller.h
index db43356..9bcc087 100644
--- a/inst/include/dplyr/Result/is_smaller.h
+++ b/inst/include/dplyr/Result/is_smaller.h
@@ -4,14 +4,14 @@
namespace dplyr {
namespace internal {
- template <int RTYPE>
- inline bool is_smaller( typename Rcpp::traits::storage_type<RTYPE>::type lhs, typename Rcpp::traits::storage_type<RTYPE>::type rhs ){
- return lhs < rhs ;
- }
- template <>
- inline bool is_smaller<STRSXP>( SEXP lhs, SEXP rhs ){
- return strcmp( CHAR(lhs), CHAR(rhs) ) < 0;
- }
+template <int RTYPE>
+inline bool is_smaller(typename Rcpp::traits::storage_type<RTYPE>::type lhs, typename Rcpp::traits::storage_type<RTYPE>::type rhs) {
+ return lhs < rhs;
+}
+template <>
+inline bool is_smaller<STRSXP>(SEXP lhs, SEXP rhs) {
+ return strcmp(CHAR(lhs), CHAR(rhs)) < 0;
+}
} // namespace internal
} // namespace dplyr
diff --git a/inst/include/dplyr/Result/max.h b/inst/include/dplyr/Result/max.h
deleted file mode 100644
index 49965fa..0000000
--- a/inst/include/dplyr/Result/max.h
+++ /dev/null
@@ -1,82 +0,0 @@
-#ifndef dplyr_Result_Max_H
-#define dplyr_Result_Max_H
-
-namespace dplyr {
-
- template <int RTYPE, bool NA_RM>
- class Max : public Processor<RTYPE, Max<RTYPE,NA_RM> > {
- public:
- typedef Processor<RTYPE, Max<RTYPE,NA_RM> > Base ;
- typedef typename Rcpp::traits::storage_type<RTYPE>::type STORAGE ;
-
- Max(SEXP x, bool is_summary_ = false) :
- Base(x),
- data_ptr( Rcpp::internal::r_vector_start<RTYPE>(x) ),
- is_summary(is_summary_) {}
- ~Max(){}
-
- STORAGE process_chunk( const SlicingIndex& indices ){
- if( indices.size() == 0) return R_NegInf ;
- if( is_summary ) return data_ptr[indices.group()] ;
- int n = indices.size() ;
-
- // find the first non NA value
- STORAGE res = data_ptr[ indices[0] ] ;
- int i=1 ;
- while( i<n && Rcpp::Vector<RTYPE>::is_na(res) ){
- res = data_ptr[ indices[i++] ] ;
- }
-
- // we enter this loop if we did not scan the full vector
- if( i < n ) for( ; i<n; i++){
- STORAGE current = data_ptr[indices[i]] ;
- if( !Rcpp::Vector<RTYPE>::is_na(current) && internal::is_smaller<RTYPE>( res, current ) ) res = current ;
- }
- return res ;
- }
-
- private:
- STORAGE* data_ptr ;
- bool is_summary ;
- } ;
-
- // quit early version for NA_RM = false
- template <int RTYPE>
- class Max<RTYPE,false> : public Processor<RTYPE, Max<RTYPE,false> > {
- public:
- typedef Processor<RTYPE, Max<RTYPE,false> > Base ;
- typedef typename Rcpp::traits::storage_type<RTYPE>::type STORAGE ;
-
- Max(SEXP x, bool is_summary_ = false) :
- Base(x),
- data_ptr( Rcpp::internal::r_vector_start<RTYPE>(x) ),
- is_summary(is_summary_) {}
- ~Max(){}
-
- STORAGE process_chunk( const SlicingIndex& indices ){
- if( indices.size() == 0) return R_NegInf ;
- if( is_summary ) return data_ptr[indices.group()] ;
-
- int n = indices.size() ;
-
- // find the first non NA value
- STORAGE res = data_ptr[ indices[0] ] ;
- if( Rcpp::Vector<RTYPE>::is_na(res) ) return res;
-
- for( int i=1; i<n; i++){
- STORAGE current = data_ptr[indices[i]] ;
- if( Rcpp::Vector<RTYPE>::is_na(current) ) return current ;
- if( internal::is_smaller<RTYPE>( res, current ) ) res = current ;
- }
- return res ;
- }
-
- private:
- STORAGE* data_ptr ;
- bool is_summary ;
- } ;
-
-
-}
-
-#endif
diff --git a/inst/include/dplyr/Result/min.h b/inst/include/dplyr/Result/min.h
deleted file mode 100644
index 2ad6165..0000000
--- a/inst/include/dplyr/Result/min.h
+++ /dev/null
@@ -1,85 +0,0 @@
-#ifndef dplyr_Result_Min_H
-#define dplyr_Result_Min_H
-
-namespace dplyr {
-
- template <int RTYPE, bool NA_RM>
- class Min : public Processor<RTYPE, Min<RTYPE,NA_RM> > {
- public:
- typedef Processor<RTYPE, Min<RTYPE,NA_RM> > Base ;
- typedef typename Rcpp::traits::storage_type<RTYPE>::type STORAGE ;
-
- Min(SEXP x, bool is_summary_ = false) :
- Base(x),
- data_ptr( Rcpp::internal::r_vector_start<RTYPE>(x) ),
- is_summary(is_summary_)
- {}
- ~Min(){}
-
- STORAGE process_chunk( const SlicingIndex& indices ){
- if( indices.size() == 0) return R_PosInf ;
- if( is_summary ) return data_ptr[ indices.group() ] ;
-
- int n = indices.size() ;
- // find the first non NA value
- STORAGE res = data_ptr[ indices[0] ] ;
- int i=1 ;
- while( i<n && Rcpp::Vector<RTYPE>::is_na(res) ){
- res = data_ptr[ indices[i++] ] ;
- }
-
- // we enter this loop if we did not scan the full vector
- if( i < n ) for( ; i<n; i++){
- STORAGE current = data_ptr[indices[i]] ;
- if( !Rcpp::Vector<RTYPE>::is_na(current) && internal::is_smaller<RTYPE>( current, res ) ) res = current ;
- }
-
- return res ;
- }
-
- private:
- STORAGE* data_ptr ;
- bool is_summary ;
- } ;
-
- // quit early version for NA_RM = false
- template <int RTYPE>
- class Min<RTYPE,false> : public Processor<RTYPE, Min<RTYPE,false> > {
- public:
- typedef Processor<RTYPE, Min<RTYPE,false> > Base ;
- typedef typename Rcpp::traits::storage_type<RTYPE>::type STORAGE ;
-
- Min(SEXP x, bool is_summary_ = false) :
- Base(x),
- data_ptr( Rcpp::internal::r_vector_start<RTYPE>(x) ),
- is_summary(is_summary_)
- {}
- ~Min(){}
-
- STORAGE process_chunk( const SlicingIndex& indices ){
- if( indices.size() == 0) return R_PosInf ;
- if( is_summary ) return data_ptr[ indices.group() ] ;
-
- int n = indices.size() ;
-
- // find the first non NA value
- STORAGE res = data_ptr[ indices[0] ] ;
- if( Rcpp::Vector<RTYPE>::is_na(res) ) return res;
-
- for( int i=1; i<n; i++){
- STORAGE current = data_ptr[indices[i]] ;
- if( Rcpp::Vector<RTYPE>::is_na(current) ) return current ;
- if( internal::is_smaller<RTYPE>( current, res ) ) res = current ;
- }
- return res ;
- }
-
- private:
- STORAGE* data_ptr ;
- bool is_summary ;
- } ;
-
-
-}
-
-#endif
diff --git a/inst/include/dplyr/RowwiseDataFrame.h b/inst/include/dplyr/RowwiseDataFrame.h
index 0409101..8872e31 100644
--- a/inst/include/dplyr/RowwiseDataFrame.h
+++ b/inst/include/dplyr/RowwiseDataFrame.h
@@ -1,81 +1,99 @@
#ifndef dplyr_tools_RowwiseDataFrame_H
#define dplyr_tools_RowwiseDataFrame_H
-namespace Rcpp {
+#include <tools/SlicingIndex.h>
+
+#include <dplyr/Result/RowwiseSubset.h>
+#include <tools/SymbolString.h>
+
+namespace dplyr {
+
+class RowwiseDataFrame;
+
+class RowwiseDataFrameIndexIterator {
+public:
+ RowwiseDataFrameIndexIterator() : i(0) {}
- class RowwiseDataFrame ;
+ RowwiseDataFrameIndexIterator& operator++() {
+ ++i;
+ return *this;
+ }
- class RowwiseDataFrameIndexIterator {
- public:
- RowwiseDataFrameIndexIterator() : i(0){}
+ RowwiseSlicingIndex operator*() const {
+ return RowwiseSlicingIndex(i);
+ }
- RowwiseDataFrameIndexIterator& operator++() {
- ++i ;
- return *this ;
- }
+ int i;
+};
- SlicingIndex operator*() const {
- return SlicingIndex( IntegerVector::create(i), i) ;
- }
+class RowwiseDataFrame {
+public:
+ typedef RowwiseDataFrameIndexIterator group_iterator;
+ typedef RowwiseSlicingIndex slicing_index;
+ typedef RowwiseSubset subset;
- int i ;
- } ;
+ RowwiseDataFrame(SEXP x):
+ data_(x),
+ group_sizes()
+ {
+ group_sizes = rep(1, data_.nrows());
+ }
- class RowwiseDataFrame {
- public:
- typedef RowwiseDataFrameIndexIterator group_iterator ;
- RowwiseDataFrame( SEXP x):
- data_(x),
- group_sizes()
- {
- group_sizes = rep(1, data_.nrows()) ;
- }
+ group_iterator group_begin() const {
+ return RowwiseDataFrameIndexIterator();
+ }
- group_iterator group_begin() const {
- return RowwiseDataFrameIndexIterator() ;
- }
+ DataFrame& data() {
+ return data_;
+ }
+ const DataFrame& data() const {
+ return data_;
+ }
- DataFrame& data() {
- return data_ ;
- }
- const DataFrame& data() const {
- return data_ ;
- }
+ inline int ngroups() const {
+ return group_sizes.size();
+ }
- inline int ngroups() const {
- return group_sizes.size() ;
- }
+ inline int nvars() const {
+ return 0;
+ }
- inline int nvars() const {
- return 0 ;
- }
+ inline SymbolString symbol(int) {
+ stop("Rowwise data frames don't have grouping variables");
+ }
- inline SEXP symbol(int i){ return R_NilValue ; }
- inline SEXP label(int i){ return R_NilValue ; }
+ inline SEXP label(int) {
+ return R_NilValue;
+ }
- inline int nrows() const {
- return data_.nrows() ;
- }
+ inline int nrows() const {
+ return data_.nrows();
+ }
- inline int max_group_size() const{
- return 1 ;
- }
+ inline int max_group_size() const {
+ return 1;
+ }
- inline const IntegerVector& get_group_sizes() const {
- return group_sizes ;
- }
+ inline subset* create_subset(SEXP x) const {
+ return rowwise_subset(x);
+ }
- private:
+private:
- DataFrame data_ ;
- IntegerVector group_sizes ;
+ DataFrame data_;
+ IntegerVector group_sizes;
- } ;
+};
- template <>
- inline bool is<RowwiseDataFrame>( SEXP x){
- return Rf_inherits(x, "rowwise_df") ;
- }
+}
+
+namespace Rcpp {
+using namespace dplyr;
+
+template <>
+inline bool is<RowwiseDataFrame>(SEXP x) {
+ return Rf_inherits(x, "rowwise_df");
+}
}
diff --git a/inst/include/dplyr/SubsetVectorVisitor.h b/inst/include/dplyr/SubsetVectorVisitor.h
index 0cf429e..78be3b9 100644
--- a/inst/include/dplyr/SubsetVectorVisitor.h
+++ b/inst/include/dplyr/SubsetVectorVisitor.h
@@ -1,43 +1,55 @@
#ifndef dplyr_SubsetVectorVisitor_H
#define dplyr_SubsetVectorVisitor_H
+#include <tools/SlicingIndex.h>
+#include <dplyr/DataFrameVisitorsIndexMap.h>
+#include <dplyr/EmptySubset.h>
+
namespace dplyr {
- /**
- * Subset Vector visitor base class, defines the interface
- */
- class SubsetVectorVisitor {
- public:
- virtual ~SubsetVectorVisitor(){}
+template <typename Container>
+inline int output_size(const Container& container) {
+ return container.size();
+}
+
+/**
+ * Subset Vector visitor base class, defines the interface
+ */
+class SubsetVectorVisitor {
+public:
+ virtual ~SubsetVectorVisitor() {}
- /** creates a new vector, of the same type as the visited vector, by
- * copying elements at the given indices
- */
- virtual SEXP subset( const Rcpp::IntegerVector& index ) const = 0 ;
+ /** creates a new vector, of the same type as the visited vector, by
+ * copying elements at the given indices
+ */
+ virtual SEXP subset(const Rcpp::IntegerVector& index) const = 0;
- virtual SEXP subset( const std::vector<int>& ) const = 0 ;
+ virtual SEXP subset(const std::vector<int>&) const = 0;
- virtual SEXP subset( const SlicingIndex& ) const = 0 ;
+ virtual SEXP subset(const SlicingIndex&) const = 0;
- /** creates a new vector, of the same type as the visited vector, by
- * copying elements at the given indices
- */
- virtual SEXP subset( const ChunkIndexMap& index ) const = 0 ;
+ /** creates a new vector, of the same type as the visited vector, by
+ * copying elements at the given indices
+ */
+ virtual SEXP subset(const ChunkIndexMap& index) const = 0;
- virtual SEXP subset( const Rcpp::LogicalVector& index ) const = 0 ;
+ virtual SEXP subset(EmptySubset) const = 0;
- virtual SEXP subset( EmptySubset ) const = 0 ;
+ virtual int size() const = 0;
- virtual int size() const = 0 ;
+ virtual std::string get_r_type() const = 0;
- virtual std::string get_r_type() const = 0 ;
+ bool is_same_typeid(SubsetVectorVisitor* other) const {
+ return typeid(*other) == typeid(*this);
+ }
- virtual bool is_compatible( SubsetVectorVisitor* other, std::stringstream&, const std::string& ) const = 0 ;
+ virtual bool is_same_type(SubsetVectorVisitor* other, std::stringstream&, const SymbolString&) const {
+ return is_same_typeid(other);
+ }
- } ;
+ virtual bool is_compatible(SubsetVectorVisitor* other, std::stringstream&, const SymbolString&) const = 0;
- // defined in visitor.h
- inline SubsetVectorVisitor* subset_visitor( SEXP ) ;
+};
} // namespace dplyr
diff --git a/inst/include/dplyr/SubsetVectorVisitorImpl.h b/inst/include/dplyr/SubsetVectorVisitorImpl.h
index 2b27868..e6d8149 100644
--- a/inst/include/dplyr/SubsetVectorVisitorImpl.h
+++ b/inst/include/dplyr/SubsetVectorVisitorImpl.h
@@ -1,248 +1,234 @@
#ifndef dplyr_SubsetVectorVisitor_Impl_H
#define dplyr_SubsetVectorVisitor_Impl_H
-namespace dplyr {
-
- /**
- * Implementations
- */
- template <int RTYPE>
- class SubsetVectorVisitorImpl : public SubsetVectorVisitor {
- public:
- typedef Rcpp::Vector<RTYPE> VECTOR ;
-
- /**
- * The type of data : int, double, SEXP, Rcomplex
- */
- typedef typename Rcpp::traits::storage_type<RTYPE>::type STORAGE ;
-
- SubsetVectorVisitorImpl( const VECTOR& vec_ ) : vec(vec_) {}
-
- inline SEXP subset( const Rcpp::IntegerVector& index) const {
- return subset_int_index( index) ;
- }
-
- inline SEXP subset( const std::vector<int>& index) const {
- return subset_int_index( index) ;
- }
-
- inline SEXP subset( const SlicingIndex& index) const {
- return subset_int_index( index) ;
- }
-
- inline SEXP subset( const ChunkIndexMap& map ) const {
- int n = output_size(map) ;
- VECTOR out = Rcpp::no_init(n) ;
- ChunkIndexMap::const_iterator it = map.begin();
- for( int i=0; i<n; i++, ++it)
- out[i] = vec[ it->first ] ;
- copy_most_attributes(out, vec) ;
- return out ;
- }
-
- inline SEXP subset( const Rcpp::LogicalVector& index ) const {
- int n = output_size(index) ;
- VECTOR out = Rcpp::no_init(n) ;
- for( int i=0, k=0; k<n; k++, i++ ) {
- while( index[i] != TRUE ) i++;
- out[k] = vec[i] ;
- }
- copy_most_attributes(out, vec) ;
- return out ;
- }
-
- inline SEXP subset( EmptySubset ) const {
- VECTOR out(0) ;
- copy_most_attributes(out, vec) ;
- return out ;
- }
-
- inline std::string get_r_type() const {
- return VectorVisitorType<RTYPE>() ;
- }
-
- inline int size() const {
- return vec.size() ;
- }
-
- inline bool is_compatible( SubsetVectorVisitor* other, std::stringstream&, const std::string& ) const {
- return true ;
- }
-
- protected:
- VECTOR vec ;
-
- template <typename Container>
- inline SEXP subset_int_index( const Container& index ) const {
- int n = output_size(index) ;
- VECTOR out = Rcpp::no_init(n) ;
- for( int i=0; i<n; i++){
- if( index[i] < 0 ){
- out[i] = VECTOR::get_na() ;
- } else {
- out[i] = vec[ index[i] ] ;
- }
- }
- copy_most_attributes(out, vec) ;
- return out ;
- }
-
- } ;
-
- template <>
- template <typename Container>
- SEXP SubsetVectorVisitorImpl<VECSXP>::subset_int_index( const Container& index ) const {
- int n = output_size(index) ;
- List out(n) ;
- for( int i=0; i<n; i++)
- out[i] = (index[i] < 0) ? R_NilValue : vec[ index[i] ] ;
- copy_most_attributes(out, vec) ;
- return out ;
- }
-
- class SubsetFactorVisitor : public SubsetVectorVisitorImpl<INTSXP> {
- public:
- typedef SubsetVectorVisitorImpl<INTSXP> Parent ;
+#include <tools/collapse.h>
+#include <tools/utils.h>
- SubsetFactorVisitor( const IntegerVector& vec_ ) : Parent(vec_){
- levels = vec.attr( "levels" ) ;
- levels_ptr = Rcpp::internal::r_vector_start<STRSXP>(levels) ;
- }
+#include <dplyr/VectorVisitorImpl.h>
+#include <dplyr/SubsetVectorVisitor.h>
- inline SEXP subset( const Rcpp::IntegerVector& index) const {
- return promote( Parent::subset( index ) );
- }
+namespace dplyr {
- inline SEXP subset( const SlicingIndex& index) const {
- return promote( Parent::subset( index ) );
- }
+/**
+ * Implementations
+ */
+template <int RTYPE>
+class SubsetVectorVisitorImpl : public SubsetVectorVisitor {
+public:
+ typedef Rcpp::Vector<RTYPE> VECTOR;
+
+ /**
+ * The type of data : int, double, SEXP, Rcomplex
+ */
+ typedef typename Rcpp::traits::storage_type<RTYPE>::type STORAGE;
+
+ SubsetVectorVisitorImpl(const VECTOR& vec_) : vec(vec_) {}
+
+ inline SEXP subset(const Rcpp::IntegerVector& index) const {
+ return subset_int_index(index);
+ }
+
+ inline SEXP subset(const std::vector<int>& index) const {
+ return subset_int_index(index);
+ }
+
+ inline SEXP subset(const SlicingIndex& index) const {
+ return subset_int_index(index);
+ }
+
+ inline SEXP subset(const ChunkIndexMap& map) const {
+ int n = output_size(map);
+ VECTOR out = Rcpp::no_init(n);
+ ChunkIndexMap::const_iterator it = map.begin();
+ for (int i = 0; i < n; i++, ++it)
+ out[i] = vec[ it->first ];
+ copy_most_attributes(out, vec);
+ return out;
+ }
+
+ inline SEXP subset(EmptySubset) const {
+ VECTOR out(0);
+ copy_most_attributes(out, vec);
+ return out;
+ }
+
+ inline std::string get_r_type() const {
+ return VectorVisitorType<RTYPE>();
+ }
+
+ inline int size() const {
+ return vec.size();
+ }
+
+ inline bool is_compatible(SubsetVectorVisitor* other, std::stringstream&, const SymbolString&) const {
+ return is_same_typeid(other);
+ }
+
+protected:
+ VECTOR vec;
+
+ template <typename Container>
+ inline SEXP subset_int_index(const Container& index) const {
+ int n = output_size(index);
+ VECTOR out = Rcpp::no_init(n);
+ for (int i = 0; i < n; i++) {
+ if (index[i] < 0) {
+ out[i] = VECTOR::get_na();
+ } else {
+ out[i] = vec[ index[i] ];
+ }
+ }
+ copy_most_attributes(out, vec);
+ return out;
+ }
+
+};
+
+template <>
+template <typename Container>
+SEXP SubsetVectorVisitorImpl<VECSXP>::subset_int_index(const Container& index) const {
+ int n = output_size(index);
+ List out(n);
+ for (int i = 0; i < n; i++)
+ out[i] = (index[i] < 0) ? R_NilValue : vec[ index[i] ];
+ copy_most_attributes(out, vec);
+ return out;
+}
- inline SEXP subset( const std::vector<int>& index) const {
- return promote( Parent::subset( index ) ) ;
- }
+class SubsetFactorVisitor : public SubsetVectorVisitorImpl<INTSXP> {
+public:
+ typedef SubsetVectorVisitorImpl<INTSXP> Parent;
- inline SEXP subset( const ChunkIndexMap& map ) const {
- return promote( Parent::subset( map ) ) ;
- }
+ SubsetFactorVisitor(const IntegerVector& vec_) : Parent(vec_) {
+ levels = get_levels(vec);
+ levels_ptr = Rcpp::internal::r_vector_start<STRSXP>(levels);
+ }
- inline SEXP subset( const Rcpp::LogicalVector& index ) const {
- return promote( Parent::subset( index ) ) ;
- }
+ inline SEXP subset(const Rcpp::IntegerVector& index) const {
+ return promote(Parent::subset(index));
+ }
- inline SEXP subset( EmptySubset empty) const {
- return promote( Parent::subset(empty) ) ;
- }
+ inline SEXP subset(const SlicingIndex& index) const {
+ return promote(Parent::subset(index));
+ }
- inline std::string get_r_type() const {
- CharacterVector classes = Parent::vec.attr( "class" ) ;
- return collapse(classes) ;
- }
+ inline SEXP subset(const std::vector<int>& index) const {
+ return promote(Parent::subset(index));
+ }
- inline bool is_compatible( SubsetVectorVisitor* other, std::stringstream& ss, const std::string& name ) const {
- if( typeid(*other) == typeid(*this) )
- return compatible( dynamic_cast<SubsetFactorVisitor*>(other), ss, name ) ;
+ inline SEXP subset(const ChunkIndexMap& map) const {
+ return promote(Parent::subset(map));
+ }
- if( typeid(*other) == typeid(SubsetVectorVisitorImpl<STRSXP>) )
- return true ;
+ inline SEXP subset(EmptySubset empty) const {
+ return promote(Parent::subset(empty));
+ }
- return false ;
- }
+ inline std::string get_r_type() const {
+ return get_single_class(Parent::vec);
+ }
- private:
+ inline bool is_same_type(SubsetVectorVisitor* other, std::stringstream& ss, const SymbolString& name) const {
+ return is_same_typeid(other) && same_levels(dynamic_cast<SubsetFactorVisitor*>(other), ss, name);
+ }
- inline bool compatible(SubsetFactorVisitor* other, std::stringstream& ss, const std::string& name ) const {
- CharacterVector levels_other = other->levels ;
- if( setdiff( levels, levels_other ).size() ){
- ss << "Factor levels not equal for column " << name ;
- return false ;
- }
- return true;
- }
+ inline bool is_compatible(SubsetVectorVisitor* other, std::stringstream&, const SymbolString&) const {
+ return is_same_typeid(other) || (typeid(*other) == typeid(SubsetVectorVisitorImpl<STRSXP>));
+ }
+private:
+ inline bool same_levels(SubsetFactorVisitor* other, std::stringstream& ss, const SymbolString& name) const {
+ CharacterVector levels_other = other->levels;
- inline SEXP promote( IntegerVector x) const {
- copy_most_attributes(x, vec ) ;
- return x ;
- }
+ if (!character_vector_equal(levels, levels_other)) {
+ ss << "Factor levels not equal for column `" << name.get_utf8_cstring() << "`";
+ return false;
+ }
+ return true;
+ }
- CharacterVector levels ;
- SEXP* levels_ptr ;
+ inline SEXP promote(IntegerVector x) const {
+ copy_most_attributes(x, vec);
+ return x;
+ }
- } ;
+ CharacterVector levels;
+ SEXP* levels_ptr;
- class DateSubsetVectorVisitor : public SubsetVectorVisitor {
- public:
+};
- DateSubsetVectorVisitor( SEXP data ) : impl(0){
- if( TYPEOF(data) == INTSXP ) {
- impl = new SubsetVectorVisitorImpl<INTSXP>(data) ;
- } else if( TYPEOF(data) == REALSXP ) {
- impl = new SubsetVectorVisitorImpl<REALSXP>(data) ;
- } else {
- stop( "" ) ;
- }
- }
+class DateSubsetVectorVisitor : public SubsetVectorVisitor {
+public:
- ~DateSubsetVectorVisitor( ){
- delete impl ;
- }
+ DateSubsetVectorVisitor(SEXP data) : impl(0) {
+ if (TYPEOF(data) == INTSXP) {
+ impl = new SubsetVectorVisitorImpl<INTSXP>(data);
+ } else if (TYPEOF(data) == REALSXP) {
+ impl = new SubsetVectorVisitorImpl<REALSXP>(data);
+ } else {
+ stop("Unreachable");
+ }
+ }
- virtual SEXP subset( const Rcpp::IntegerVector& index ) const {
- return impl->subset( index ) ;
- }
+ ~DateSubsetVectorVisitor() {
+ delete impl;
+ }
- virtual SEXP subset( const SlicingIndex& index ) const {
- return impl->subset( index ) ;
- }
+ virtual SEXP subset(const Rcpp::IntegerVector& index) const {
+ return impl->subset(index);
+ }
- virtual SEXP subset( const std::vector<int>& index ) const {
- return impl->subset( index ) ;
- }
+ virtual SEXP subset(const SlicingIndex& index) const {
+ return impl->subset(index);
+ }
- virtual SEXP subset( const ChunkIndexMap& index ) const {
- return impl->subset(index) ;
- }
+ virtual SEXP subset(const std::vector<int>& index) const {
+ return impl->subset(index);
+ }
- virtual SEXP subset( const Rcpp::LogicalVector& index ) const {
- return impl->subset( index ) ;
- }
+ virtual SEXP subset(const ChunkIndexMap& index) const {
+ return impl->subset(index);
+ }
- virtual SEXP subset( EmptySubset index ) const {
- return impl->subset( index ) ;
- }
+ virtual SEXP subset(EmptySubset index) const {
+ return impl->subset(index);
+ }
- virtual int size() const {
- return impl->size() ;
- }
+ virtual int size() const {
+ return impl->size();
+ }
- virtual std::string get_r_type() const {
- return impl->get_r_type() ;
- }
+ virtual std::string get_r_type() const {
+ return impl->get_r_type();
+ }
- bool is_compatible( SubsetVectorVisitor* other, std::stringstream&, const std::string& ) const {
- return typeid(*other) == typeid(*this) ;
- }
+ bool is_compatible(SubsetVectorVisitor* other, std::stringstream&, const SymbolString&) const {
+ return is_same_typeid(other);
+ }
- private:
- SubsetVectorVisitor* impl ;
- DateSubsetVectorVisitor( const DateSubsetVectorVisitor& ) ;
+private:
+ SubsetVectorVisitor* impl;
+ DateSubsetVectorVisitor(const DateSubsetVectorVisitor&);
- } ;
+};
- template <>
- inline bool SubsetVectorVisitorImpl<INTSXP>::is_compatible( SubsetVectorVisitor* other, std::stringstream&, const std::string& ) const {
- return typeid(*other) == typeid(*this) || typeid(*other) == typeid(SubsetVectorVisitorImpl<REALSXP>) ;
- }
+template <>
+inline bool SubsetVectorVisitorImpl<INTSXP>::is_compatible(SubsetVectorVisitor* other, std::stringstream&,
+ const SymbolString&) const {
+ return is_same_typeid(other) || typeid(*other) == typeid(SubsetVectorVisitorImpl<REALSXP>);
+}
- template <>
- inline bool SubsetVectorVisitorImpl<REALSXP>::is_compatible( SubsetVectorVisitor* other, std::stringstream&, const std::string& ) const {
- return typeid(*other) == typeid(*this) || typeid(*other) == typeid(SubsetVectorVisitorImpl<INTSXP>) ;
- }
+template <>
+inline bool SubsetVectorVisitorImpl<REALSXP>::is_compatible(SubsetVectorVisitor* other, std::stringstream&,
+ const SymbolString&) const {
+ return is_same_typeid(other) || typeid(*other) == typeid(SubsetVectorVisitorImpl<INTSXP>);
+}
- template <>
- inline bool SubsetVectorVisitorImpl<STRSXP>::is_compatible( SubsetVectorVisitor* other, std::stringstream&, const std::string& ) const {
- return typeid(*other) == typeid(*this) || typeid(*other) == typeid(SubsetFactorVisitor) ;
- }
+template <>
+inline bool SubsetVectorVisitorImpl<STRSXP>::is_compatible(SubsetVectorVisitor* other, std::stringstream&,
+ const SymbolString&) const {
+ return is_same_typeid(other) || typeid(*other) == typeid(SubsetFactorVisitor);
+}
}
diff --git a/inst/include/dplyr/SummarisedVariable.h b/inst/include/dplyr/SummarisedVariable.h
index 347b87e..35908d3 100644
--- a/inst/include/dplyr/SummarisedVariable.h
+++ b/inst/include/dplyr/SummarisedVariable.h
@@ -3,16 +3,16 @@
namespace dplyr {
- class SummarisedVariable {
- public:
- SummarisedVariable(SEXP x) : data(x){}
+class SummarisedVariable {
+public:
+ SummarisedVariable(SEXP x) : data(x) {}
- inline operator SEXP() const{
- return data;
- }
- private:
- SEXP data ;
- } ;
+ inline operator SEXP() const {
+ return data;
+ }
+private:
+ SEXP data;
+};
}
diff --git a/inst/include/dplyr/VectorVisitor.h b/inst/include/dplyr/VectorVisitor.h
index 1e33038..7e1f1e4 100644
--- a/inst/include/dplyr/VectorVisitor.h
+++ b/inst/include/dplyr/VectorVisitor.h
@@ -3,41 +3,34 @@
namespace dplyr {
- /**
- * Vector visitor base class, defines the interface
- */
- class VectorVisitor {
- public:
- virtual ~VectorVisitor(){}
+/**
+ * Vector visitor base class, defines the interface
+ */
+class VectorVisitor {
+public:
+ virtual ~VectorVisitor() {}
- /** hash the element of the visited vector at index i */
- virtual size_t hash(int i) const = 0 ;
+ /** hash the element of the visited vector at index i */
+ virtual size_t hash(int i) const = 0;
- /** are the elements at indices i and j equal */
- virtual bool equal(int i, int j) const = 0 ;
+ /** are the elements at indices i and j equal */
+ virtual bool equal(int i, int j) const = 0;
- /** are the elements at indices i and j equal or both NA */
- virtual bool equal_or_both_na(int i, int j) const = 0 ;
+ /** are the elements at indices i and j equal or both NA */
+ virtual bool equal_or_both_na(int i, int j) const = 0;
- /** is the i element less than the j element */
- virtual bool less( int i, int j) const = 0 ;
+ /** is the i element less than the j element */
+ virtual bool less(int i, int j) const = 0;
- /** is the i element less than the j element */
- virtual bool greater( int i, int j) const = 0 ;
+ /** is the i element less than the j element */
+ virtual bool greater(int i, int j) const = 0;
- virtual int size() const = 0 ;
+ virtual int size() const = 0;
- virtual std::string get_r_type() const = 0 ;
+ virtual std::string get_r_type() const = 0;
- virtual bool is_compatible( VectorVisitor* other, std::stringstream&, const std::string& ) const {
- return true ;
- }
-
- virtual bool is_na( int i ) const = 0 ;
- } ;
-
- // defined in visitor.h
- inline VectorVisitor* visitor( SEXP ) ;
+ virtual bool is_na(int i) const = 0;
+};
} // namespace dplyr
diff --git a/inst/include/dplyr/VectorVisitorImpl.h b/inst/include/dplyr/VectorVisitorImpl.h
index f262fba..f8b763b 100644
--- a/inst/include/dplyr/VectorVisitorImpl.h
+++ b/inst/include/dplyr/VectorVisitorImpl.h
@@ -1,185 +1,199 @@
#ifndef dplyr_VectorVisitor_Impl_H
#define dplyr_VectorVisitor_Impl_H
+#include <tools/collapse.h>
+#include <tools/utils.h>
+
+#include <dplyr/CharacterVectorOrderer.h>
+#include <dplyr/comparisons.h>
+#include <dplyr/VectorVisitor.h>
+#include <tools/encoding.h>
+
namespace dplyr {
- template <typename Container>
- inline int output_size( const Container& container){
- return container.size() ;
- }
-
- template <>
- inline int output_size<LogicalVector>( const LogicalVector& container){
- return std::count( container.begin(), container.end(), TRUE ) ;
- }
-
- template <int RTYPE> std::string VectorVisitorType() ;
- template <> inline std::string VectorVisitorType<INTSXP>() { return "integer" ; }
- template <> inline std::string VectorVisitorType<REALSXP>() { return "numeric" ; }
- template <> inline std::string VectorVisitorType<LGLSXP>() { return "logical" ; }
- template <> inline std::string VectorVisitorType<STRSXP>() { return "character" ; }
- template <> inline std::string VectorVisitorType<CPLXSXP>() { return "complex" ; }
- template <> inline std::string VectorVisitorType<VECSXP>() { return "list" ; }
-
- /**
- * Implementations
- */
- template <int RTYPE>
- class VectorVisitorImpl : public VectorVisitor, public comparisons<RTYPE> {
- public:
- typedef comparisons<RTYPE> compare ;
- typedef Rcpp::Vector<RTYPE> VECTOR ;
-
- /**
- * The type of data : int, double, SEXP, Rcomplex
- */
- typedef typename Rcpp::traits::storage_type<RTYPE>::type STORAGE ;
-
- /**
- * Hasher for that type of data
- */
- typedef boost::hash<STORAGE> hasher ;
-
- VectorVisitorImpl( const VECTOR& vec_ ) : vec(vec_) {}
-
- /**
- * implementations
- */
- size_t hash(int i) const {
- return hash_fun( vec[i] ) ;
- }
- inline bool equal(int i, int j) const {
- return compare::equal_or_both_na( vec[i], vec[j] ) ;
- }
-
- inline bool less(int i, int j) const {
- return compare::is_less( vec[i], vec[j] ) ;
- }
-
- inline bool equal_or_both_na(int i, int j) const {
- return compare::equal_or_both_na( vec[i], vec[j] ) ;
- }
-
- inline bool greater(int i, int j) const {
- return compare::is_greater( vec[i], vec[j] ) ;
- }
-
- inline std::string get_r_type() const {
- return VectorVisitorType<RTYPE>() ;
- }
-
- int size() const {
- return vec.size() ;
- }
-
- bool is_na( int i ) const {
- return VECTOR::is_na( vec[i] ) ;
- }
-
- protected:
- VECTOR vec ;
- hasher hash_fun ;
-
- } ;
-
- class FactorVisitor : public VectorVisitorImpl<INTSXP> {
- public:
- typedef VectorVisitorImpl<INTSXP> Parent ;
-
- FactorVisitor( const IntegerVector& vec_ ) : Parent(vec_){
- levels = vec.attr( "levels" ) ;
- levels_ptr = Rcpp::internal::r_vector_start<STRSXP>(levels) ;
- }
-
- inline bool equal(int i, int j) const {
- return vec[i] == vec[j] ;
- }
-
- inline bool less(int i, int j) const {
- return string_compare.is_less(
- vec[i] < 0 ? NA_STRING : levels_ptr[vec[i]],
- vec[j] < 0 ? NA_STRING : levels_ptr[vec[j]]
- ) ;
- }
-
- inline bool greater(int i, int j) const {
- return string_compare.is_greater(
- vec[i] < 0 ? NA_STRING : levels_ptr[vec[i]],
- vec[j] < 0 ? NA_STRING : levels_ptr[vec[j]]
- ) ;
- }
-
- inline std::string get_r_type() const {
- CharacterVector classes = Parent::vec.attr( "class" ) ;
- return collapse(classes) ;
- }
-
- bool is_compatible( VectorVisitor* other, std::stringstream& ss, const std::string& name ) const {
- return compatible( dynamic_cast<FactorVisitor*>(other), ss, name ) ;
- }
-
- private:
-
- inline bool compatible(FactorVisitor* other, std::stringstream& ss, const std::string& name ) const {
- CharacterVector levels_other = other->levels ;
- if( setdiff( levels, levels_other ).size() ){
- ss << "Factor levels not equal for column " << name ;
- return false ;
- }
- return true;
- }
-
- CharacterVector levels ;
- SEXP* levels_ptr ;
- comparisons<STRSXP> string_compare ;
- } ;
-
-
- template <>
- class VectorVisitorImpl<STRSXP> : public VectorVisitor {
- public:
-
- VectorVisitorImpl( const CharacterVector& vec_ ) :
- vec(vec_),
- orders( CharacterVectorOrderer(vec).get() )
- {}
-
- size_t hash(int i) const {
- return orders[i] ;
- }
- inline bool equal(int i, int j) const {
- return orders[i] == orders[j] ;
- }
-
- inline bool less(int i, int j) const {
- return orders[i] < orders[j] ;
- }
-
- inline bool equal_or_both_na(int i, int j) const {
- return orders[i] == orders[j] ;
- }
-
- inline bool greater(int i, int j) const {
- return orders[i] > orders[j] ;
- }
-
- inline std::string get_r_type() const {
- return VectorVisitorType<STRSXP>() ;
- }
-
- int size() const {
- return vec.size() ;
- }
-
- bool is_na( int i ) const {
- return CharacterVector::is_na(vec[i]) ;
- }
-
- protected:
- CharacterVector vec ;
- IntegerVector orders ;
-
- } ;
+template <int RTYPE> std::string VectorVisitorType();
+template <> inline std::string VectorVisitorType<INTSXP>() {
+ return "integer";
+}
+template <> inline std::string VectorVisitorType<REALSXP>() {
+ return "numeric";
+}
+template <> inline std::string VectorVisitorType<LGLSXP>() {
+ return "logical";
+}
+template <> inline std::string VectorVisitorType<STRSXP>() {
+ return "character";
+}
+template <> inline std::string VectorVisitorType<CPLXSXP>() {
+ return "complex";
+}
+template <> inline std::string VectorVisitorType<VECSXP>() {
+ return "list";
+}
+
+/**
+ * Implementations
+ */
+template <int RTYPE>
+class VectorVisitorImpl : public VectorVisitor {
+ typedef comparisons<RTYPE> compare;
+
+public:
+ typedef Rcpp::Vector<RTYPE> VECTOR;
+
+ /**
+ * The type of data : int, double, SEXP, Rcomplex
+ */
+ typedef typename Rcpp::traits::storage_type<RTYPE>::type STORAGE;
+
+ /**
+ * Hasher for that type of data
+ */
+ typedef boost::hash<STORAGE> hasher;
+
+ VectorVisitorImpl(const VECTOR& vec_) : vec(vec_) {}
+
+ /**
+ * implementations
+ */
+ size_t hash(int i) const {
+ return hash_fun(vec[i]);
+ }
+ inline bool equal(int i, int j) const {
+ return compare::equal_or_both_na(vec[i], vec[j]);
+ }
+
+ inline bool less(int i, int j) const {
+ return compare::is_less(vec[i], vec[j]);
+ }
+
+ inline bool equal_or_both_na(int i, int j) const {
+ return compare::equal_or_both_na(vec[i], vec[j]);
+ }
+
+ inline bool greater(int i, int j) const {
+ return compare::is_greater(vec[i], vec[j]);
+ }
+
+ inline std::string get_r_type() const {
+ return VectorVisitorType<RTYPE>();
+ }
+
+ int size() const {
+ return vec.size();
+ }
+
+ bool is_na(int i) const {
+ return VECTOR::is_na(vec[i]);
+ }
+
+protected:
+ VECTOR vec;
+ hasher hash_fun;
+
+};
+
+class FactorVisitor : public VectorVisitorImpl<INTSXP> {
+ typedef comparisons<STRSXP> string_compare;
+
+public:
+ typedef VectorVisitorImpl<INTSXP> Parent;
+
+ FactorVisitor(const IntegerVector& vec_) : Parent(vec_) {
+ levels = get_levels(vec);
+ levels_ptr = Rcpp::internal::r_vector_start<STRSXP>(levels);
+ }
+
+ inline bool equal(int i, int j) const {
+ return vec[i] == vec[j];
+ }
+
+ inline bool less(int i, int j) const {
+ return
+ string_compare::is_less(
+ vec[i] < 0 ? NA_STRING : levels_ptr[vec[i]],
+ vec[j] < 0 ? NA_STRING : levels_ptr[vec[j]]
+ );
+ }
+
+ inline bool greater(int i, int j) const {
+ return
+ string_compare::is_greater(
+ vec[i] < 0 ? NA_STRING : levels_ptr[vec[i]],
+ vec[j] < 0 ? NA_STRING : levels_ptr[vec[j]]
+ );
+ }
+
+ inline std::string get_r_type() const {
+ return get_single_class(Parent::vec);
+ }
+
+private:
+ CharacterVector levels;
+ SEXP* levels_ptr;
+};
+
+
+template <>
+class VectorVisitorImpl<STRSXP> : public VectorVisitor {
+public:
+
+ VectorVisitorImpl(const CharacterVector& vec_) :
+ vec(reencode_char(vec_)), has_orders(false)
+ {}
+
+ size_t hash(int i) const {
+ return reinterpret_cast<size_t>(get_item(i));
+ }
+ inline bool equal(int i, int j) const {
+ return equal_or_both_na(i, j);
+ }
+
+ inline bool less(int i, int j) const {
+ provide_orders();
+ return orders[i] < orders[j];
+ }
+
+ inline bool equal_or_both_na(int i, int j) const {
+ return get_item(i) == get_item(j);
+ }
+
+ inline bool greater(int i, int j) const {
+ provide_orders();
+ return orders[i] > orders[j];
+ }
+
+ inline std::string get_r_type() const {
+ return VectorVisitorType<STRSXP>();
+ }
+
+ int size() const {
+ return vec.size();
+ }
+
+ bool is_na(int i) const {
+ return CharacterVector::is_na(vec[i]);
+ }
+
+private:
+ SEXP get_item(const int i) const {
+ return static_cast<SEXP>(vec[i]);
+ }
+
+ void provide_orders() const {
+ if (has_orders)
+ return;
+
+ orders = CharacterVectorOrderer(vec).get();
+ has_orders = true;
+ }
+
+private:
+ CharacterVector vec;
+ mutable IntegerVector orders;
+ mutable bool has_orders;
+
+};
}
diff --git a/inst/include/dplyr/bad.h b/inst/include/dplyr/bad.h
new file mode 100644
index 0000000..2c04dba
--- /dev/null
+++ b/inst/include/dplyr/bad.h
@@ -0,0 +1,89 @@
+#ifndef DPLYR_DPLYR_BAD_H
+#define DPLYR_DPLYR_BAD_H
+
+namespace dplyr {
+
+template<class C1>
+void NORET bad_arg(const SymbolString& arg, C1 arg1) {
+ static Function bad_fun = Function("bad_args", Environment::namespace_env("dplyr"));
+ static Function identity = Function("identity", Environment::base_env());
+ String message = bad_fun(CharacterVector::create(arg.get_string()), arg1, _[".abort"] = identity);
+ message.set_encoding(CE_UTF8);
+ stop(message.get_cstring());
+}
+
+template<class C1, class C2>
+void NORET bad_arg(const SymbolString& arg, C1 arg1, C2 arg2) {
+ static Function bad_fun = Function("bad_args", Environment::namespace_env("dplyr"));
+ static Function identity = Function("identity", Environment::base_env());
+ String message = bad_fun(CharacterVector::create(arg.get_string()), arg1, arg2, _[".abort"] = identity);
+ message.set_encoding(CE_UTF8);
+ stop(message.get_cstring());
+}
+
+template<class C1, class C2, class C3>
+void NORET bad_arg(const SymbolString& arg, C1 arg1, C2 arg2, C3 arg3) {
+ static Function bad_fun = Function("bad_args", Environment::namespace_env("dplyr"));
+ static Function identity = Function("identity", Environment::base_env());
+ String message = bad_fun(CharacterVector::create(arg.get_string()), arg1, arg2, arg3, _[".abort"] = identity);
+ message.set_encoding(CE_UTF8);
+ stop(message.get_cstring());
+}
+
+template<class C1>
+void NORET bad_pos_arg(int pos_arg, C1 arg1) {
+ static Function bad_fun = Function("bad_pos_args", Environment::namespace_env("dplyr"));
+ static Function identity = Function("identity", Environment::base_env());
+ String message = bad_fun(pos_arg, arg1, _[".abort"] = identity);
+ message.set_encoding(CE_UTF8);
+ stop(message.get_cstring());
+}
+
+template<class C1, class C2>
+void NORET bad_pos_arg(int pos_arg, C1 arg1, C2 arg2) {
+ static Function bad_fun = Function("bad_pos_args", Environment::namespace_env("dplyr"));
+ static Function identity = Function("identity", Environment::base_env());
+ String message = bad_fun(pos_arg, arg1, arg2, _[".abort"] = identity);
+ message.set_encoding(CE_UTF8);
+ stop(message.get_cstring());
+}
+
+template<class C1, class C2, class C3>
+void NORET bad_pos_arg(int pos_arg, C1 arg1, C2 arg2, C3 arg3) {
+ static Function bad_fun = Function("bad_pos_args", Environment::namespace_env("dplyr"));
+ static Function identity = Function("identity", Environment::base_env());
+ String message = bad_fun(pos_arg, arg1, arg2, arg3, _[".abort"] = identity);
+ message.set_encoding(CE_UTF8);
+ stop(message.get_cstring());
+}
+
+template<class C1>
+void NORET bad_col(const SymbolString& col, C1 arg1) {
+ static Function bad_fun = Function("bad_cols", Environment::namespace_env("dplyr"));
+ static Function identity = Function("identity", Environment::base_env());
+ String message = bad_fun(CharacterVector::create(col.get_string()), arg1, _[".abort"] = identity);
+ message.set_encoding(CE_UTF8);
+ stop(message.get_cstring());
+}
+
+template<class C1, class C2>
+void NORET bad_col(const SymbolString& col, C1 arg1, C2 arg2) {
+ static Function bad_fun = Function("bad_cols", Environment::namespace_env("dplyr"));
+ static Function identity = Function("identity", Environment::base_env());
+ String message = bad_fun(CharacterVector::create(col.get_string()), arg1, arg2, _[".abort"] = identity);
+ message.set_encoding(CE_UTF8);
+ stop(message.get_cstring());
+}
+
+template<class C1, class C2, class C3>
+void NORET bad_col(const SymbolString& col, C1 arg1, C2 arg2, C3 arg3) {
+ static Function bad_fun = Function("bad_cols", Environment::namespace_env("dplyr"));
+ static Function identity = Function("identity", Environment::base_env());
+ String message = bad_fun(CharacterVector::create(col.get_string()), arg1, arg2, arg3, _[".abort"] = identity);
+ message.set_encoding(CE_UTF8);
+ stop(message.get_cstring());
+}
+
+}
+
+#endif // DPLYR_DPLYR_BAD_H
diff --git a/inst/include/dplyr/check_supported_type.h b/inst/include/dplyr/check_supported_type.h
deleted file mode 100644
index ea8ccf3..0000000
--- a/inst/include/dplyr/check_supported_type.h
+++ /dev/null
@@ -1,37 +0,0 @@
-#ifndef dplyr_check_supported_type_H
-#define dplyr_check_supported_type_H
-
-namespace dplyr {
-
- enum SupportedType {
- DPLYR_LGLSXP = LGLSXP,
- DPLYR_INTSXP = INTSXP,
- DPLYR_REALSXP = REALSXP,
- DPLYR_CPLXSXP = CPLXSXP,
- DPLYR_STRSXP = STRSXP,
- DPLYR_VECSXP = VECSXP
- };
-
- inline SupportedType check_supported_type(SEXP x, SEXP name = R_NilValue){
- switch( TYPEOF(x) ){
- case LGLSXP: return DPLYR_LGLSXP ;
- case INTSXP: return DPLYR_INTSXP ;
- case REALSXP: return DPLYR_REALSXP ;
- case CPLXSXP: return DPLYR_CPLXSXP ;
- case STRSXP: return DPLYR_STRSXP ;
- case VECSXP: return DPLYR_VECSXP ;
- default:
- if ( name == R_NilValue ) {
- stop( "Unsupported type %s", type2name(x)) ;
- }
- else {
- stop( "Unsupported type %s for column \"%s\"", type2name(x), CHAR(name)) ;
- }
-
- // Unreachable, can be removed with Rcpp > 0.12.5.2
- return DPLYR_LGLSXP ;
- }
- }
-
-}
-#endif
diff --git a/inst/include/dplyr/checks.h b/inst/include/dplyr/checks.h
new file mode 100644
index 0000000..45375bb
--- /dev/null
+++ b/inst/include/dplyr/checks.h
@@ -0,0 +1,93 @@
+#ifndef dplyr_checks_H
+#define dplyr_checks_H
+
+#include <tools/SymbolString.h>
+#include <dplyr/bad.h>
+
+namespace dplyr {
+
+enum SupportedType {
+ DPLYR_LGLSXP = LGLSXP,
+ DPLYR_INTSXP = INTSXP,
+ DPLYR_REALSXP = REALSXP,
+ DPLYR_CPLXSXP = CPLXSXP,
+ DPLYR_STRSXP = STRSXP,
+ DPLYR_VECSXP = VECSXP
+};
+
+inline std::string type_name(SEXP x) {
+ switch (TYPEOF(x)) {
+ case NILSXP:
+ return "NULL";
+ case SYMSXP:
+ return "symbol";
+ case S4SXP:
+ return "S4";
+ case LGLSXP:
+ return "logical vector";
+ case INTSXP:
+ return "integer vector";
+ case REALSXP:
+ return "double vector";
+ case STRSXP:
+ return "character vector";
+ case CPLXSXP:
+ return "complex vector";
+ case RAWSXP:
+ return "raw vector";
+ case VECSXP:
+ return "list";
+ case LANGSXP:
+ return "quoted call";
+ case EXPRSXP:
+ return "expression";
+ case ENVSXP:
+ return "environment";
+
+ case SPECIALSXP:
+ case BUILTINSXP:
+ case CLOSXP:
+ return "function";
+
+ // Everything else can fall back to R's default
+ default:
+ return std::string(Rf_type2char(TYPEOF(x)));
+ }
+}
+
+inline SupportedType check_supported_type(SEXP x, const SymbolString& name = String()) {
+ switch (TYPEOF(x)) {
+ case LGLSXP:
+ return DPLYR_LGLSXP;
+ case INTSXP:
+ return DPLYR_INTSXP;
+ case REALSXP:
+ return DPLYR_REALSXP;
+ case CPLXSXP:
+ return DPLYR_CPLXSXP;
+ case STRSXP:
+ return DPLYR_STRSXP;
+ case VECSXP:
+ return DPLYR_VECSXP;
+ default:
+ if (name.is_empty()) {
+ Rcpp::stop("is of unsupported type %s", type_name(x));
+ } else {
+ bad_col(name, "is of unsupported type {type}",
+ _["type"] = type_name(x));
+ }
+ }
+}
+
+inline void check_length(const int actual, const int expected, const char* comment, const SymbolString& name) {
+ if (actual == expected || actual == 1) return;
+
+ static Function check_length_col("check_length_col", Environment::namespace_env("dplyr"));
+ static Function identity("identity", Environment::base_env());
+ String message = check_length_col(actual, expected, CharacterVector::create(name.get_sexp()), std::string(comment), _[".abort"] = identity);
+ message.set_encoding(CE_UTF8);
+ stop(message.get_cstring());
+}
+
+}
+#endif
diff --git a/inst/include/dplyr/comparisons.h b/inst/include/dplyr/comparisons.h
index 835a4f1..e136c55 100644
--- a/inst/include/dplyr/comparisons.h
+++ b/inst/include/dplyr/comparisons.h
@@ -3,126 +3,127 @@
namespace dplyr {
- template <int RTYPE>
- struct comparisons {
- typedef typename Rcpp::traits::storage_type<RTYPE>::type STORAGE ;
-
- inline bool is_less(STORAGE lhs, STORAGE rhs) const {
- if (is_na(lhs)) return false;
- if (is_na(rhs)) return true;
-
- return lhs < rhs ;
- }
-
- inline bool is_greater(STORAGE lhs, STORAGE rhs) const {
- return lhs > rhs ;
- }
-
- inline bool equal_or_both_na( STORAGE lhs, STORAGE rhs ) const {
- return lhs == rhs ;
- }
-
- inline bool is_na(STORAGE x) const {
- return Rcpp::traits::is_na<RTYPE>(x);
- }
-
- } ;
-
- template <>
- struct comparisons<STRSXP> {
- inline bool is_less( SEXP lhs, SEXP rhs ) const {
- // we need this because CHAR(NA_STRING) gives "NA"
- if (is_na(lhs)) return false;
- if (is_na(rhs)) return true;
- return strcmp( CHAR(lhs), CHAR(rhs) ) < 0 ;
- }
-
- inline bool is_greater( SEXP lhs, SEXP rhs) const {
- if (is_na(lhs)) return false;
- if (is_na(rhs)) return true;
- return strcmp( CHAR(lhs), CHAR(rhs) ) > 0;
- }
-
- inline bool equal_or_both_na( SEXP lhs, SEXP rhs ) const {
- return lhs == rhs ;
- }
-
- inline bool is_na(SEXP x) const {
- return Rcpp::traits::is_na<STRSXP>(x);
- }
-
- } ;
-
- // taking advantage of the particularity of NA_REAL
- template <>
- struct comparisons<REALSXP> {
-
- inline bool is_less(double lhs, double rhs) const {
- if( is_nan(lhs) ) {
- return false ;
- } else if( is_na(lhs) ){
- return is_nan(rhs) ;
- } else {
- // lhs >= rhs is false if rhs is NA or NaN
- return !( lhs >= rhs) ;
- }
-
- }
-
- inline bool is_greater(double lhs, double rhs) const {
- if( is_nan(lhs) ) {
- return false ;
- } else if( is_na(lhs) ){
- return is_nan(rhs) ;
- } else {
- // lhs <= rhs is false if rhs is NA or NaN
- return !( lhs <= rhs) ;
- }
-
- }
-
- inline bool equal_or_both_na( double lhs, double rhs ) const {
- return lhs == rhs ||
- ( is_nan(lhs) && is_nan(rhs) ) ||
- ( is_na(lhs) && is_na(rhs) );
- }
-
- inline bool is_na(double x) const {
- return ISNA(x);
- }
-
- inline bool is_nan(double x) const {
- return Rcpp::traits::is_nan<REALSXP>(x) ;
- }
-
- } ;
-
- template <>
- struct comparisons<CPLXSXP> {
-
- inline bool is_less(Rcomplex lhs, Rcomplex rhs) const {
- if (is_na(lhs)) return false;
- if (is_na(rhs)) return true;
-
- return lhs.r < rhs.r || ( lhs.r == rhs.r && lhs.i < rhs.i ) ;
- }
-
- inline bool is_greater(Rcomplex lhs, Rcomplex rhs) const {
- if (is_na(lhs)) return false;
- if (is_na(rhs)) return true;
-
- return ! ( lhs.r < rhs.r || ( lhs.r == rhs.r && lhs.i <= rhs.i ) );
- }
-
- inline bool equal_or_both_na( Rcomplex lhs, Rcomplex rhs ) const {
- return lhs.r == rhs.r && lhs.i == rhs.i ;
- }
-
- inline bool is_na(Rcomplex x) const {
- return Rcpp::traits::is_na<CPLXSXP>(x);
- }
-
- } ;
+template <int RTYPE>
+struct comparisons {
+ typedef typename Rcpp::traits::storage_type<RTYPE>::type STORAGE;
+
+ static inline bool is_less(STORAGE lhs, STORAGE rhs) {
+ if (is_na(lhs)) return false;
+ if (is_na(rhs)) return true;
+
+ return lhs < rhs;
+ }
+
+ static inline bool is_greater(STORAGE lhs, STORAGE rhs) {
+ return lhs > rhs;
+ }
+
+ static inline bool equal_or_both_na(STORAGE lhs, STORAGE rhs) {
+ return lhs == rhs;
+ }
+
+ static inline bool is_na(STORAGE x) {
+ return Rcpp::traits::is_na<RTYPE>(x);
+ }
+
+};
+
+template <>
+struct comparisons<STRSXP> {
+ static inline bool is_less(SEXP lhs, SEXP rhs) {
+ // we need this because CHAR(NA_STRING) gives "NA"
+ if (is_na(lhs)) return false;
+ if (is_na(rhs)) return true;
+ return strcmp(CHAR(lhs), CHAR(rhs)) < 0;
+ }
+
+ static inline bool is_greater(SEXP lhs, SEXP rhs) {
+ if (is_na(lhs)) return false;
+ if (is_na(rhs)) return true;
+ return strcmp(CHAR(lhs), CHAR(rhs)) > 0;
+ }
+
+ static inline bool equal_or_both_na(SEXP lhs, SEXP rhs) {
+ return lhs == rhs;
+ }
+
+ static inline bool is_na(SEXP x) {
+ return Rcpp::traits::is_na<STRSXP>(x);
+ }
+
+};
+
+// taking advantage of the particularity of NA_REAL
+template <>
+struct comparisons<REALSXP> {
+
+ static inline bool is_less(double lhs, double rhs) {
+ if (is_nan(lhs)) {
+ return false;
+ } else if (is_na(lhs)) {
+ return is_nan(rhs);
+ } else {
+ // lhs >= rhs is false if rhs is NA or NaN
+ return !(lhs >= rhs);
+ }
+
+ }
+
+ static inline bool is_greater(double lhs, double rhs) {
+ if (is_nan(lhs)) {
+ return false;
+ } else if (is_na(lhs)) {
+ return is_nan(rhs);
+ } else {
+ // lhs <= rhs is false if rhs is NA or NaN
+ return !(lhs <= rhs);
+ }
+
+ }
+
+ static inline bool equal_or_both_na(double lhs, double rhs) {
+ return
+ lhs == rhs ||
+ (is_nan(lhs) && is_nan(rhs)) ||
+ (is_na(lhs) && is_na(rhs));
+ }
+
+ static inline bool is_na(double x) {
+ return ISNA(x);
+ }
+
+ static inline bool is_nan(double x) {
+ return Rcpp::traits::is_nan<REALSXP>(x);
+ }
+
+};
+
+template <>
+struct comparisons<CPLXSXP> {
+
+ static inline bool is_less(Rcomplex lhs, Rcomplex rhs) {
+ if (is_na(lhs)) return false;
+ if (is_na(rhs)) return true;
+
+ return lhs.r < rhs.r || (lhs.r == rhs.r && lhs.i < rhs.i);
+ }
+
+ static inline bool is_greater(Rcomplex lhs, Rcomplex rhs) {
+ if (is_na(lhs)) return false;
+ if (is_na(rhs)) return true;
+
+ return !(lhs.r < rhs.r || (lhs.r == rhs.r && lhs.i <= rhs.i));
+ }
+
+ static inline bool equal_or_both_na(Rcomplex lhs, Rcomplex rhs) {
+ return lhs.r == rhs.r && lhs.i == rhs.i;
+ }
+
+ static inline bool is_na(Rcomplex x) {
+ return Rcpp::traits::is_na<CPLXSXP>(x);
+ }
+
+};
}
diff --git a/inst/include/dplyr/comparisons_different.h b/inst/include/dplyr/comparisons_different.h
deleted file mode 100644
index 4be5092..0000000
--- a/inst/include/dplyr/comparisons_different.h
+++ /dev/null
@@ -1,92 +0,0 @@
-#ifndef dplyr_comparisons_different_H
-#define dplyr_comparisons_different_H
-
-namespace dplyr {
-
- // not defined on purpose
- template <int LHS_RTYPE, int RHS_RTYPE>
- struct comparisons_different ;
-
- // specialization when LHS_TYPE == RHS_TYPE
- template <int RTYPE>
- struct comparisons_different<RTYPE,RTYPE> : comparisons<RTYPE>{} ;
-
- // works for both LHS_RTYPE = INTSXP and LHS_RTYPE = LGLSXP
- template <int LHS_RTYPE>
- struct comparisons_int_double {
-
- inline bool is_less( int lhs, double rhs ) const {
- if( lhs == NA_INTEGER ){
- return is_nan(rhs) ;
- }
- return !( (double)lhs >= rhs ) ;
- }
-
- inline bool is_greater( int lhs, double rhs ) const {
- if( lhs == NA_INTEGER ){
- return is_nan(rhs) ;
- }
- return !( (double)lhs <= rhs ) ;
- }
-
- inline bool is_nan(double x) const {
- return Rcpp::traits::is_nan<REALSXP>(x) ;
- }
-
- inline bool equal_or_both_na( int lhs, double rhs ) const {
- if( lhs == NA_INTEGER && ISNA(rhs) ) return true ;
- return (double)lhs == rhs ;
- }
-
- } ;
-
- template <>
- struct comparisons_different<INTSXP, REALSXP> : comparisons_int_double<INTSXP>{} ;
-
- template <>
- struct comparisons_different<LGLSXP, REALSXP> : comparisons_int_double<LGLSXP>{} ;
-
-
-
- template <int LHS_RTYPE>
- struct comparisons_double_int {
-
- inline bool is_less( double lhs, int rhs ) const {
- if( is_nan(lhs) || ISNA(lhs) ) return false ;
- if( rhs == NA_INTEGER ) return true ;
- return lhs < (double)rhs ;
- }
-
- inline bool is_greater( double lhs, int rhs ) const {
- if( is_nan(lhs) || ISNA(lhs) ) return false ;
- if( rhs == NA_INTEGER ) return true ;
- return lhs > (double)rhs ;
- }
-
- inline bool is_nan(double x) const {
- return Rcpp::traits::is_nan<REALSXP>(x) ;
- }
-
- inline bool equal_or_both_na( double lhs, int rhs ) const {
- if( rhs == NA_INTEGER && ISNA(lhs) ) return true ;
- return (double)rhs == lhs ;
- }
-
- } ;
-
- template <>
- struct comparisons_different<REALSXP, INTSXP> : comparisons_double_int<INTSXP>{} ;
-
- template <>
- struct comparisons_different<REALSXP, LGLSXP> : comparisons_double_int<LGLSXP>{} ;
-
- template <>
- struct comparisons_different<INTSXP, LGLSXP> : comparisons<INTSXP>{} ;
-
- template <>
- struct comparisons_different<LGLSXP, INTSXP> : comparisons<INTSXP>{} ;
-
-}
-
-#endif
-
diff --git a/inst/include/dplyr/dplyr.h b/inst/include/dplyr/dplyr.h
new file mode 100644
index 0000000..c6af902
--- /dev/null
+++ b/inst/include/dplyr/dplyr.h
@@ -0,0 +1,36 @@
+#ifndef dplyr_dplyr_dplyr_H
+#define dplyr_dplyr_dplyr_H
+
+#include <dplyr/registration.h>
+#include <dplyr/CharacterVectorOrderer.h>
+#include <dplyr/white_list.h>
+#include <dplyr/checks.h>
+#include <dplyr/visitor_set/visitor_set.h>
+#include <dplyr/BoolResult.h>
+#include <dplyr/GroupedDataFrame.h>
+#include <dplyr/RowwiseDataFrame.h>
+
+#include <dplyr/tbl_cpp.h>
+#include <dplyr/comparisons.h>
+#include <dplyr/join_match.h>
+#include <dplyr/MultipleVectorVisitors.h>
+#include <dplyr/DataFrameSubsetVisitors.h>
+#include <dplyr/subset_visitor.h>
+#include <dplyr/subset_visitor_impl.h>
+#include <dplyr/visitor.h>
+#include <dplyr/visitor_impl.h>
+#include <dplyr/OrderVisitorImpl.h>
+#include <dplyr/JoinVisitor.h>
+#include <dplyr/JoinVisitorImpl.h>
+#include <dplyr/DataFrameJoinVisitors.h>
+#include <dplyr/Order.h>
+#include <dplyr/Hybrid.h>
+#include <dplyr/Result/all.h>
+#include <dplyr/Gatherer.h>
+#include <dplyr/Replicator.h>
+#include <dplyr/Collecter.h>
+#include <dplyr/NamedListAccumulator.h>
+#include <dplyr/train.h>
+#include <dplyr/Groups.h>
+
+#endif // #ifndef dplyr_dplyr_dplyr_H
diff --git a/inst/include/dplyr/get_column.h b/inst/include/dplyr/get_column.h
new file mode 100644
index 0000000..78bebb6
--- /dev/null
+++ b/inst/include/dplyr/get_column.h
@@ -0,0 +1,10 @@
+#ifndef dplyr_dplyr_get_column_H
+#define dplyr_dplyr_get_column_H
+
+namespace dplyr {
+
+SymbolString get_column(SEXP, const Environment&, const ILazySubsets&);
+
+}
+
+#endif // #ifndef dplyr_dplyr_get_column_H
diff --git a/inst/include/dplyr/join_match.h b/inst/include/dplyr/join_match.h
new file mode 100644
index 0000000..c5e4869
--- /dev/null
+++ b/inst/include/dplyr/join_match.h
@@ -0,0 +1,80 @@
+#ifndef dplyr_join_match_H
+#define dplyr_join_match_H
+
+#include <dplyr/comparisons.h>
+
+namespace dplyr {
+
+// not defined on purpose
+template <int LHS_RTYPE, int RHS_RTYPE, bool ACCEPT_NA_MATCH>
+struct join_match;
+
+// specialization when LHS_TYPE == RHS_TYPE
+template <int RTYPE, bool ACCEPT_NA_MATCH>
+struct join_match<RTYPE, RTYPE, ACCEPT_NA_MATCH> {
+ typedef comparisons<RTYPE> compare;
+ typedef typename Rcpp::traits::storage_type<RTYPE>::type STORAGE;
+
+ static inline bool is_match(STORAGE lhs, STORAGE rhs) {
+ return compare::equal_or_both_na(lhs, rhs) && (ACCEPT_NA_MATCH || !compare::is_na(lhs));
+ }
+};
+
+// NaN also don't match for reals
+template <bool ACCEPT_NA_MATCH>
+struct join_match<REALSXP, REALSXP, ACCEPT_NA_MATCH> {
+ typedef comparisons<REALSXP> compare;
+
+ static inline bool is_match(double lhs, double rhs) {
+ if (ACCEPT_NA_MATCH)
+ return compare::equal_or_both_na(lhs, rhs);
+ else
+ return lhs == rhs && (ACCEPT_NA_MATCH || (!compare::is_na(lhs) && !compare::is_nan(lhs)));
+ }
+};
+
+// works for both LHS_RTYPE = INTSXP and LHS_RTYPE = LGLSXP
+template <int LHS_RTYPE, bool ACCEPT_NA_MATCH>
+struct join_match_int_double {
+ static inline bool is_match(int lhs, double rhs) {
+ LOG_VERBOSE << lhs << " " << rhs;
+ if (double(lhs) == rhs) {
+ return (lhs != NA_INTEGER);
+ }
+ else {
+ if (ACCEPT_NA_MATCH)
+ return (lhs == NA_INTEGER && ISNA(rhs));
+ else
+ return false;
+ }
+ }
+};
+
+template <bool ACCEPT_NA_MATCH>
+struct join_match<INTSXP, REALSXP, ACCEPT_NA_MATCH> : join_match_int_double<INTSXP, ACCEPT_NA_MATCH> {};
+
+template <bool ACCEPT_NA_MATCH>
+struct join_match<LGLSXP, REALSXP, ACCEPT_NA_MATCH> : join_match_int_double<LGLSXP, ACCEPT_NA_MATCH> {};
+
+template <int RHS_RTYPE, bool ACCEPT_NA_MATCH>
+struct join_match_double_int {
+ static inline bool is_match(double lhs, int rhs) {
+ return join_match_int_double<RHS_RTYPE, ACCEPT_NA_MATCH>::is_match(rhs, lhs);
+ }
+};
+
+template <bool ACCEPT_NA_MATCH>
+struct join_match<REALSXP, INTSXP, ACCEPT_NA_MATCH> : join_match_double_int<INTSXP, ACCEPT_NA_MATCH> {};
+
+template <bool ACCEPT_NA_MATCH>
+struct join_match<REALSXP, LGLSXP, ACCEPT_NA_MATCH> : join_match_double_int<LGLSXP, ACCEPT_NA_MATCH> {};
+
+template <bool ACCEPT_NA_MATCH>
+struct join_match<INTSXP, LGLSXP, ACCEPT_NA_MATCH> : join_match<INTSXP, INTSXP, ACCEPT_NA_MATCH> {};
+
+template <bool ACCEPT_NA_MATCH>
+struct join_match<LGLSXP, INTSXP, ACCEPT_NA_MATCH> : join_match<INTSXP, INTSXP, ACCEPT_NA_MATCH> {};
+
+}
+
+#endif // #ifndef dplyr_join_match_H
diff --git a/inst/include/dplyr/main.h b/inst/include/dplyr/main.h
new file mode 100644
index 0000000..c8ce4f9
--- /dev/null
+++ b/inst/include/dplyr/main.h
@@ -0,0 +1,15 @@
+#ifndef dplyr_dplyr_main_H
+#define dplyr_dplyr_main_H
+
+#include <Rcpp.h>
+#include <dplyr/workarounds.h>
+#include <dplyr/workarounds/static_assert.h>
+#include <dplyr/workarounds/xlen.h>
+#include <solaris/solaris.h>
+#include <dplyr/config.h>
+
+#include <plogr.h>
+
+using namespace Rcpp;
+
+#endif // #ifndef dplyr_dplyr_main_H
diff --git a/inst/include/dplyr/registration.h b/inst/include/dplyr/registration.h
index 70e7871..b70162d 100644
--- a/inst/include/dplyr/registration.h
+++ b/inst/include/dplyr/registration.h
@@ -1,33 +1,18 @@
#ifndef dplyr_registration_H
#define dplyr_registration_H
-#if !defined(COMPILING_DPLYR)
-
-#define GRAB_CALLABLE(__FUN__) static Fun fun = (Fun)R_GetCCallable( "dplyr", #__FUN__ ) ;
-
-inline DataFrame build_index_cpp( DataFrame data ){
- typedef DataFrame (*Fun)(DataFrame) ;
- GRAB_CALLABLE(build_index_cpp)
- return fun(data) ;
-}
-
-inline void registerHybridHandler( const char* name, HybridHandler proto){
- typedef void (*Fun)(const char*, HybridHandler ) ;
- GRAB_CALLABLE(registerHybridHandler)
- return fun(name, proto) ;
-}
-
-inline SEXP get_time_classes(){
- typedef SEXP (*Fun)(void) ;
- GRAB_CALLABLE(get_time_classes)
- return fun() ;
-}
-
-inline SEXP get_date_classes(){
- typedef SEXP (*Fun)(void) ;
- GRAB_CALLABLE(get_time_classes)
- return fun() ;
-}
+#include <dplyr/HybridHandler.h>
+
+#if defined(COMPILING_DPLYR)
+
+DataFrame build_index_cpp(DataFrame data);
+void registerHybridHandler(const char*, HybridHandler);
+SEXP get_time_classes();
+SEXP get_date_classes();
+
+#else
+
+#include "dplyr_RcppExports.h"
#endif
diff --git a/inst/include/dplyr/subset_visitor.h b/inst/include/dplyr/subset_visitor.h
index eabaeb5..0dc7da5 100644
--- a/inst/include/dplyr/subset_visitor.h
+++ b/inst/include/dplyr/subset_visitor.h
@@ -1,67 +1,11 @@
#ifndef dplyr_subset_visitor_H
#define dplyr_subset_visitor_H
-namespace dplyr {
-
-inline SubsetVectorVisitor* subset_visitor_matrix( SEXP vec );
-inline SubsetVectorVisitor* subset_visitor_vector( SEXP vec );
-
-inline SubsetVectorVisitor* subset_visitor( SEXP vec ){
- if( Rf_isMatrix( vec ) ){
- return subset_visitor_matrix(vec) ;
- }
- else {
- return subset_visitor_vector(vec) ;
- }
-}
-
-inline SubsetVectorVisitor* subset_visitor_matrix( SEXP vec ){
- switch( TYPEOF(vec) ){
- case CPLXSXP: return new MatrixColumnSubsetVisitor<CPLXSXP>( vec ) ;
- case INTSXP: return new MatrixColumnSubsetVisitor<INTSXP>( vec ) ;
- case REALSXP: return new MatrixColumnSubsetVisitor<REALSXP>( vec ) ;
- case LGLSXP: return new MatrixColumnSubsetVisitor<LGLSXP>( vec ) ;
- case STRSXP: return new MatrixColumnSubsetVisitor<STRSXP>( vec ) ;
- case VECSXP: return new MatrixColumnSubsetVisitor<VECSXP>( vec ) ;
- default: break ;
- }
+#include <dplyr/SubsetVectorVisitor.h>
- stop("Unsupported matrix type %s", Rf_type2char(TYPEOF(vec))) ;
- return 0 ;
-}
-
-inline SubsetVectorVisitor* subset_visitor_vector( SEXP vec ){
- if( Rf_inherits(vec, "Date") ){
- return new DateSubsetVectorVisitor(vec) ;
- }
-
- switch( TYPEOF(vec) ){
- case CPLXSXP:
- return new SubsetVectorVisitorImpl<CPLXSXP>( vec ) ;
- case INTSXP:
- if( Rf_inherits(vec, "factor" ))
- return new SubsetFactorVisitor( vec ) ;
- return new SubsetVectorVisitorImpl<INTSXP>( vec ) ;
- case REALSXP: return new SubsetVectorVisitorImpl<REALSXP>( vec ) ;
- case LGLSXP: return new SubsetVectorVisitorImpl<LGLSXP>( vec ) ;
- case STRSXP: return new SubsetVectorVisitorImpl<STRSXP>( vec ) ;
-
- case VECSXP: {
- if( Rf_inherits( vec, "data.frame" ) ){
- return new DataFrameColumnSubsetVisitor(vec) ;
- }
- if( Rf_inherits( vec, "POSIXlt" )) {
- stop( "POSIXlt not supported" ) ;
- }
- return new SubsetVectorVisitorImpl<VECSXP>( vec ) ;
- }
- default: break ;
- }
+namespace dplyr {
- // should not happen, safeguard against segfaults anyway
- stop("Unsupported vector type %s", Rf_type2char(TYPEOF(vec))) ;
- return 0 ;
-}
+inline SubsetVectorVisitor* subset_visitor(SEXP vec, const SymbolString& name);
}
diff --git a/inst/include/dplyr/subset_visitor_impl.h b/inst/include/dplyr/subset_visitor_impl.h
new file mode 100644
index 0000000..1c43c6e
--- /dev/null
+++ b/inst/include/dplyr/subset_visitor_impl.h
@@ -0,0 +1,88 @@
+#ifndef dplyr_subset_visitor_impl_H
+#define dplyr_subset_visitor_impl_H
+
+#include <dplyr/SubsetVectorVisitorImpl.h>
+#include <dplyr/DataFrameSubsetVisitors.h>
+#include <dplyr/DataFrameColumnSubsetVisitor.h>
+#include <dplyr/MatrixColumnSubsetVectorVisitor.h>
+#include <dplyr/bad.h>
+
+namespace dplyr {
+
+inline SubsetVectorVisitor* subset_visitor_matrix(SEXP vec);
+inline SubsetVectorVisitor* subset_visitor_vector(SEXP vec);
+
+inline SubsetVectorVisitor* subset_visitor(SEXP vec, const SymbolString& name) {
+ try {
+ if (Rf_isMatrix(vec)) {
+ return subset_visitor_matrix(vec);
+ }
+ else {
+ return subset_visitor_vector(vec);
+ }
+ }
+ catch (const Rcpp::exception& e) {
+ bad_col(name, e.what());
+ }
+}
+
+inline SubsetVectorVisitor* subset_visitor_matrix(SEXP vec) {
+ switch (TYPEOF(vec)) {
+ case CPLXSXP:
+ return new MatrixColumnSubsetVisitor<CPLXSXP>(vec);
+ case INTSXP:
+ return new MatrixColumnSubsetVisitor<INTSXP>(vec);
+ case REALSXP:
+ return new MatrixColumnSubsetVisitor<REALSXP>(vec);
+ case LGLSXP:
+ return new MatrixColumnSubsetVisitor<LGLSXP>(vec);
+ case STRSXP:
+ return new MatrixColumnSubsetVisitor<STRSXP>(vec);
+ case VECSXP:
+ return new MatrixColumnSubsetVisitor<VECSXP>(vec);
+ default:
+ break;
+ }
+
+ stop("unsupported matrix type %s", Rf_type2char(TYPEOF(vec)));
+}
+
+inline SubsetVectorVisitor* subset_visitor_vector(SEXP vec) {
+ if (Rf_inherits(vec, "Date")) {
+ return new DateSubsetVectorVisitor(vec);
+ }
+
+ switch (TYPEOF(vec)) {
+ case CPLXSXP:
+ return new SubsetVectorVisitorImpl<CPLXSXP>(vec);
+ case INTSXP:
+ if (Rf_inherits(vec, "factor"))
+ return new SubsetFactorVisitor(vec);
+ return new SubsetVectorVisitorImpl<INTSXP>(vec);
+ case REALSXP:
+ return new SubsetVectorVisitorImpl<REALSXP>(vec);
+ case LGLSXP:
+ return new SubsetVectorVisitorImpl<LGLSXP>(vec);
+ case STRSXP:
+ return new SubsetVectorVisitorImpl<STRSXP>(vec);
+
+ case VECSXP: {
+ if (Rf_inherits(vec, "data.frame")) {
+ return new DataFrameColumnSubsetVisitor(vec);
+ }
+ if (Rf_inherits(vec, "POSIXlt")) {
+ stop("POSIXlt not supported");
+ }
+ return new SubsetVectorVisitorImpl<VECSXP>(vec);
+ }
+ default:
+ break;
+ }
+
+ // should not happen, safeguard against segfaults anyway
+ stop("is of unsupported type %s", Rf_type2char(TYPEOF(vec)));
+}
+
+}
+
+#endif
diff --git a/inst/include/dplyr/tbl_cpp.h b/inst/include/dplyr/tbl_cpp.h
index ec4d5c6..f4b5e6b 100644
--- a/inst/include/dplyr/tbl_cpp.h
+++ b/inst/include/dplyr/tbl_cpp.h
@@ -1,27 +1,29 @@
#ifndef dplyr_tools_tbl_cpp_H
#define dplyr_tools_tbl_cpp_H
+#include <dplyr/RowwiseDataFrame.h>
+
namespace dplyr {
- template <typename Df>
- inline void set_rownames( Df& data, int n ){
- data.attr( "row.names" ) = Rcpp::IntegerVector::create(
- Rcpp::IntegerVector::get_na(), -n) ;
- }
-
- template <typename Data>
- inline Rcpp::CharacterVector classes_grouped(){
- return Rcpp::CharacterVector::create( "grouped_df", "tbl_df", "tbl", "data.frame") ;
- }
-
- template <>
- inline Rcpp::CharacterVector classes_grouped<RowwiseDataFrame>(){
- return Rcpp::CharacterVector::create( "rowwise_df", "tbl_df", "tbl", "data.frame") ;
- }
-
- inline Rcpp::CharacterVector classes_not_grouped(){
- return Rcpp::CharacterVector::create( "tbl_df", "tbl", "data.frame") ;
- }
+template <typename Df>
+inline void set_rownames(Df& data, int n) {
+ data.attr("row.names") =
+ Rcpp::IntegerVector::create(Rcpp::IntegerVector::get_na(), -n);
+}
+
+template <typename Data>
+inline Rcpp::CharacterVector classes_grouped() {
+ return Rcpp::CharacterVector::create("grouped_df", "tbl_df", "tbl", "data.frame");
+}
+
+template <>
+inline Rcpp::CharacterVector classes_grouped<RowwiseDataFrame>() {
+ return Rcpp::CharacterVector::create("rowwise_df", "tbl_df", "tbl", "data.frame");
+}
+
+inline Rcpp::CharacterVector classes_not_grouped() {
+ return Rcpp::CharacterVector::create("tbl_df", "tbl", "data.frame");
+}
}
diff --git a/inst/include/dplyr/train.h b/inst/include/dplyr/train.h
index 2da75d5..5fd6ae5 100644
--- a/inst/include/dplyr/train.h
+++ b/inst/include/dplyr/train.h
@@ -3,56 +3,56 @@
namespace dplyr {
- template <typename Op>
- inline void iterate_with_interupts( Op op, int n ){
- int i=0 ;
- if( n > DPLYR_MIN_INTERUPT_SIZE ){
- int m = n / DPLYR_INTERUPT_TIMES ;
- for( int k=0; k<DPLYR_INTERUPT_TIMES; k++){
- for( int j=0; j<m; j++, i++) op(i) ;
- Rcpp::checkUserInterrupt() ;
- }
- }
- for( ; i<n; i++) op(i) ;
+template <typename Op>
+inline void iterate_with_interupts(Op op, int n) {
+ int i = 0;
+ if (n > DPLYR_MIN_INTERUPT_SIZE) {
+ int m = n / DPLYR_INTERUPT_TIMES;
+ for (int k = 0; k < DPLYR_INTERUPT_TIMES; k++) {
+ for (int j = 0; j < m; j++, i++) op(i);
+ Rcpp::checkUserInterrupt();
}
+ }
+ for (; i < n; i++) op(i);
+}
- template <typename Map>
- struct push_back_op {
- push_back_op( Map& map_ ) : map(map_){}
- inline void operator()(int i){
- map[i].push_back(i) ;
- }
- Map& map ;
- } ;
-
- template <typename Map>
- struct push_back_right_op {
- push_back_right_op( Map& map_ ) : map(map_){}
- inline void operator()(int i){
- map[-i-1].push_back(-i-1) ;
- }
- Map& map ;
- } ;
-
-
- template <typename Map>
- inline void train_push_back( Map& map, int n){
- iterate_with_interupts( push_back_op<Map>(map), n) ;
- }
+template <typename Map>
+struct push_back_op {
+ push_back_op(Map& map_) : map(map_) {}
+ inline void operator()(int i) {
+ map[i].push_back(i);
+ }
+ Map& map;
+};
+
+template <typename Map>
+struct push_back_right_op {
+ push_back_right_op(Map& map_) : map(map_) {}
+ inline void operator()(int i) {
+ map[-i - 1].push_back(-i - 1);
+ }
+ Map& map;
+};
+
+
+template <typename Map>
+inline void train_push_back(Map& map, int n) {
+ iterate_with_interupts(push_back_op<Map>(map), n);
+}
- template <typename Map>
- inline void train_push_back_right( Map& map, int n){
- iterate_with_interupts( push_back_right_op<Map>(map), n) ;
- }
+template <typename Map>
+inline void train_push_back_right(Map& map, int n) {
+ iterate_with_interupts(push_back_right_op<Map>(map), n);
+}
- template <typename Set>
- inline void train_insert( Set& set, int n){
- for( int i=0; i<n; i++) set.insert(i) ;
- }
- template <typename Set>
- inline void train_insert_right( Set& set, int n){
- for( int i=0; i<n; i++) set.insert(-i-1) ;
- }
+template <typename Set>
+inline void train_insert(Set& set, int n) {
+ for (int i = 0; i < n; i++) set.insert(i);
+}
+template <typename Set>
+inline void train_insert_right(Set& set, int n) {
+ for (int i = 0; i < n; i++) set.insert(-i - 1);
+}
}
#endif
diff --git a/inst/include/dplyr/vector_class.h b/inst/include/dplyr/vector_class.h
index 196ffd6..cd11978 100644
--- a/inst/include/dplyr/vector_class.h
+++ b/inst/include/dplyr/vector_class.h
@@ -3,33 +3,33 @@
namespace dplyr {
- template <int RTYPE>
- inline std::string vector_class() ;
+template <int RTYPE>
+inline std::string vector_class();
- template <>
- inline std::string vector_class<INTSXP>(){
- return "integer" ;
- }
- template <>
- inline std::string vector_class<REALSXP>(){
- return "numeric" ;
- }
- template <>
- inline std::string vector_class<STRSXP>(){
- return "character" ;
- }
- template <>
- inline std::string vector_class<LGLSXP>(){
- return "logical" ;
- }
- template <>
- inline std::string vector_class<VECSXP>(){
- return "list" ;
- }
- template <>
- inline std::string vector_class<CPLXSXP>(){
- return "complex" ;
- }
+template <>
+inline std::string vector_class<INTSXP>() {
+ return "integer";
+}
+template <>
+inline std::string vector_class<REALSXP>() {
+ return "numeric";
+}
+template <>
+inline std::string vector_class<STRSXP>() {
+ return "character";
+}
+template <>
+inline std::string vector_class<LGLSXP>() {
+ return "logical";
+}
+template <>
+inline std::string vector_class<VECSXP>() {
+ return "list";
+}
+template <>
+inline std::string vector_class<CPLXSXP>() {
+ return "complex";
+}
}
diff --git a/inst/include/dplyr/visitor.h b/inst/include/dplyr/visitor.h
index bbfde4b..a226daa 100644
--- a/inst/include/dplyr/visitor.h
+++ b/inst/include/dplyr/visitor.h
@@ -1,64 +1,11 @@
#ifndef dplyr_visitor_H
#define dplyr_visitor_H
-namespace dplyr {
-
-inline VectorVisitor* visitor_matrix( SEXP vec ) ;
-inline VectorVisitor* visitor_vector( SEXP vec ) ;
-
-inline VectorVisitor* visitor( SEXP vec ){
- if( Rf_isMatrix( vec ) ){
- return visitor_matrix(vec) ;
- }
- else {
- return visitor_vector(vec) ;
- }
-}
-
-inline VectorVisitor* visitor_matrix( SEXP vec ){
- switch( TYPEOF(vec) ){
- case CPLXSXP: return new MatrixColumnVisitor<CPLXSXP>( vec ) ;
- case INTSXP: return new MatrixColumnVisitor<INTSXP>( vec ) ;
- case REALSXP: return new MatrixColumnVisitor<REALSXP>( vec ) ;
- case LGLSXP: return new MatrixColumnVisitor<LGLSXP>( vec ) ;
- case STRSXP: return new MatrixColumnVisitor<STRSXP>( vec ) ;
- case VECSXP: return new MatrixColumnVisitor<VECSXP>( vec ) ;
- default: break ;
- }
+#include <dplyr/VectorVisitor.h>
- stop("Unsupported matrix type %s", Rf_type2char(TYPEOF(vec))) ;
- return 0 ;
-}
-
-inline VectorVisitor* visitor_vector( SEXP vec ){
- switch( TYPEOF(vec) ){
- case CPLXSXP:
- return new VectorVisitorImpl<CPLXSXP>( vec ) ;
- case INTSXP:
- if( Rf_inherits(vec, "factor" ))
- return new FactorVisitor( vec ) ;
- return new VectorVisitorImpl<INTSXP>( vec ) ;
- case REALSXP:
- return new VectorVisitorImpl<REALSXP>( vec ) ;
- case LGLSXP: return new VectorVisitorImpl<LGLSXP>( vec ) ;
- case STRSXP: return new VectorVisitorImpl<STRSXP>( vec ) ;
-
- case VECSXP: {
- if( Rf_inherits( vec, "data.frame" ) ){
- return new DataFrameColumnVisitor(vec) ;
- }
- if( Rf_inherits( vec, "POSIXlt" )) {
- stop( "POSIXlt not supported" ) ;
- }
- return new VectorVisitorImpl<VECSXP>( vec ) ;
- }
- default: break ;
- }
+namespace dplyr {
- // should not happen, safeguard against segfaults anyway
- stop("Unsupported vector type %s", Rf_type2char(TYPEOF(vec))) ;
- return 0 ;
-}
+inline VectorVisitor* visitor(SEXP vec);
}
diff --git a/inst/include/dplyr/visitor_impl.h b/inst/include/dplyr/visitor_impl.h
new file mode 100644
index 0000000..23a5fef
--- /dev/null
+++ b/inst/include/dplyr/visitor_impl.h
@@ -0,0 +1,77 @@
+#ifndef dplyr_visitor_impl_H
+#define dplyr_visitor_impl_H
+
+#include <dplyr/VectorVisitorImpl.h>
+#include <dplyr/DataFrameColumnVisitor.h>
+#include <dplyr/MatrixColumnVisitor.h>
+
+namespace dplyr {
+
+inline VectorVisitor* visitor_matrix(SEXP vec);
+inline VectorVisitor* visitor_vector(SEXP vec);
+
+inline VectorVisitor* visitor(SEXP vec) {
+ if (Rf_isMatrix(vec)) {
+ return visitor_matrix(vec);
+ }
+ else {
+ return visitor_vector(vec);
+ }
+}
+
+inline VectorVisitor* visitor_matrix(SEXP vec) {
+ switch (TYPEOF(vec)) {
+ case CPLXSXP:
+ return new MatrixColumnVisitor<CPLXSXP>(vec);
+ case INTSXP:
+ return new MatrixColumnVisitor<INTSXP>(vec);
+ case REALSXP:
+ return new MatrixColumnVisitor<REALSXP>(vec);
+ case LGLSXP:
+ return new MatrixColumnVisitor<LGLSXP>(vec);
+ case STRSXP:
+ return new MatrixColumnVisitor<STRSXP>(vec);
+ case VECSXP:
+ return new MatrixColumnVisitor<VECSXP>(vec);
+ default:
+ break;
+ }
+
+ stop("unsupported matrix type %s", Rf_type2char(TYPEOF(vec)));
+}
+
+inline VectorVisitor* visitor_vector(SEXP vec) {
+ switch (TYPEOF(vec)) {
+ case CPLXSXP:
+ return new VectorVisitorImpl<CPLXSXP>(vec);
+ case INTSXP:
+ if (Rf_inherits(vec, "factor"))
+ return new FactorVisitor(vec);
+ return new VectorVisitorImpl<INTSXP>(vec);
+ case REALSXP:
+ return new VectorVisitorImpl<REALSXP>(vec);
+ case LGLSXP:
+ return new VectorVisitorImpl<LGLSXP>(vec);
+ case STRSXP:
+ return new VectorVisitorImpl<STRSXP>(vec);
+
+ case VECSXP: {
+ if (Rf_inherits(vec, "data.frame")) {
+ return new DataFrameColumnVisitor(vec);
+ }
+ if (Rf_inherits(vec, "POSIXlt")) {
+ stop("POSIXlt not supported");
+ }
+ return new VectorVisitorImpl<VECSXP>(vec);
+ }
+ default:
+ break;
+ }
+
+ // should not happen, safeguard against segfaults anyway
+ stop("is of unsupported type %s", Rf_type2char(TYPEOF(vec)));
+}
+
+}
+
+#endif
diff --git a/inst/include/dplyr/visitor_set/VisitorEqualPredicate.h b/inst/include/dplyr/visitor_set/VisitorEqualPredicate.h
index 32820c9..259d88d 100644
--- a/inst/include/dplyr/visitor_set/VisitorEqualPredicate.h
+++ b/inst/include/dplyr/visitor_set/VisitorEqualPredicate.h
@@ -1,20 +1,20 @@
#ifndef dplyr_VisitorEqualPredicate_H
#define dplyr_VisitorEqualPredicate_H
-namespace dplyr{
+namespace dplyr {
- template <typename Visitor>
- class VisitorEqualPredicate{
- public:
- VisitorEqualPredicate( const Visitor& v_ ) : v(v_){}
+template <typename Visitor>
+class VisitorEqualPredicate {
+public:
+ VisitorEqualPredicate(const Visitor& v_) : v(v_) {}
- inline bool operator()(int i, int j) const {
- return v.equal_or_both_na(i, j) ;
- }
+ inline bool operator()(int i, int j) const {
+ return v.equal_or_both_na(i, j);
+ }
- private:
- const Visitor& v ;
- };
+private:
+ const Visitor& v;
+};
}
#endif
diff --git a/inst/include/dplyr/visitor_set/VisitorHash.h b/inst/include/dplyr/visitor_set/VisitorHash.h
index f051a41..ce5aef1 100644
--- a/inst/include/dplyr/visitor_set/VisitorHash.h
+++ b/inst/include/dplyr/visitor_set/VisitorHash.h
@@ -1,20 +1,20 @@
#ifndef dplyr_VisitorHash_H
#define dplyr_VisitorHash_H
-namespace dplyr{
+namespace dplyr {
- template <typename Visitor>
- class VisitorHash{
- public:
- VisitorHash( const Visitor& v_ ) : v(v_){}
+template <typename Visitor>
+class VisitorHash {
+public:
+ VisitorHash(const Visitor& v_) : v(v_) {}
- inline size_t operator()(int i) const {
- return v.hash(i) ;
- }
+ inline size_t operator()(int i) const {
+ return v.hash(i);
+ }
- private:
- const Visitor& v ;
- };
+private:
+ const Visitor& v;
+};
}
#endif
diff --git a/inst/include/dplyr/visitor_set/VisitorSetEqual.h b/inst/include/dplyr/visitor_set/VisitorSetEqual.h
index d6a6516..3959edb 100644
--- a/inst/include/dplyr/visitor_set/VisitorSetEqual.h
+++ b/inst/include/dplyr/visitor_set/VisitorSetEqual.h
@@ -1,29 +1,29 @@
#ifndef dplyr_VisitorSetEqual_H
#define dplyr_VisitorSetEqual_H
-namespace dplyr{
+namespace dplyr {
template <typename Class>
class VisitorSetEqual {
public:
- bool equal( int i, int j) const {
- const Class& obj = static_cast<const Class&>(*this) ;
- if( i == j ) return true ;
- int n=obj.size() ;
- for( int k=0; k<n; k++)
- if( ! obj.get(k)->equal(i,j) ) return false ;
- return true ;
- }
+ bool equal(int i, int j) const {
+ const Class& obj = static_cast<const Class&>(*this);
+ if (i == j) return true;
+ int n = obj.size();
+ for (int k = 0; k < n; k++)
+ if (! obj.get(k)->equal(i, j)) return false;
+ return true;
+ }
- bool equal_or_both_na( int i, int j) const {
- const Class& obj = static_cast<const Class&>(*this) ;
- if( i == j ) return true ;
- int n=obj.size() ;
- for( int k=0; k<n; k++)
- if( ! obj.get(k)->equal_or_both_na(i,j) ) return false ;
- return true ;
- }
-} ;
+ bool equal_or_both_na(int i, int j) const {
+ const Class& obj = static_cast<const Class&>(*this);
+ if (i == j) return true;
+ int n = obj.size();
+ for (int k = 0; k < n; k++)
+ if (! obj.get(k)->equal_or_both_na(i, j)) return false;
+ return true;
+ }
+};
}
diff --git a/inst/include/dplyr/visitor_set/VisitorSetEqualPredicate.h b/inst/include/dplyr/visitor_set/VisitorSetEqualPredicate.h
index 0c100db..6086dd2 100644
--- a/inst/include/dplyr/visitor_set/VisitorSetEqualPredicate.h
+++ b/inst/include/dplyr/visitor_set/VisitorSetEqualPredicate.h
@@ -1,21 +1,21 @@
#ifndef dplyr_VisitorSetEqualPredicate_H
#define dplyr_VisitorSetEqualPredicate_H
-namespace dplyr{
+namespace dplyr {
- template <typename VisitorSet>
- class VisitorSetEqualPredicate {
- public:
- VisitorSetEqualPredicate() : visitors(0){}
+template <typename VisitorSet>
+class VisitorSetEqualPredicate {
+public:
+ VisitorSetEqualPredicate() : visitors(0) {}
- VisitorSetEqualPredicate( VisitorSet* visitors_ ) : visitors(visitors_) {} ;
- inline bool operator()(int i, int j) const {
- return visitors->equal(i,j) ;
- }
+ VisitorSetEqualPredicate(VisitorSet* visitors_) : visitors(visitors_) {};
+ inline bool operator()(int i, int j) const {
+ return visitors->equal(i, j);
+ }
- private:
- VisitorSet* visitors ;
- } ;
+private:
+ VisitorSet* visitors;
+};
}
diff --git a/inst/include/dplyr/visitor_set/VisitorSetGreater.h b/inst/include/dplyr/visitor_set/VisitorSetGreater.h
index a05ea3b..9a08c0b 100644
--- a/inst/include/dplyr/visitor_set/VisitorSetGreater.h
+++ b/inst/include/dplyr/visitor_set/VisitorSetGreater.h
@@ -1,26 +1,26 @@
#ifndef dplyr_VisitorSetGreater_H
#define dplyr_VisitorSetGreater_H
-namespace dplyr{
+namespace dplyr {
template <typename Class>
class VisitorSetGreater {
public:
- bool greater( int i, int j) const {
- if( i == j ) return false ;
- const Class& obj = static_cast<const Class&>(*this) ;
- int n=obj.size();
- for( int k=0; k<n; k++){
- typename Class::visitor_type* visitor = obj.get(k) ;
- if( ! visitor->equal(i,j) ){
- return visitor->greater(i,j) ;
- }
- }
- // if we end up here, it means rows i and j are equal
- // we break the tie using the indices
- return i < j ;
+ bool greater(int i, int j) const {
+ if (i == j) return false;
+ const Class& obj = static_cast<const Class&>(*this);
+ int n = obj.size();
+ for (int k = 0; k < n; k++) {
+ typename Class::visitor_type* visitor = obj.get(k);
+ if (! visitor->equal(i, j)) {
+ return visitor->greater(i, j);
+ }
}
-} ;
+ // if we end up here, it means rows i and j are equal
+ // we break the tie using the indices
+ return i < j;
+ }
+};
}
diff --git a/inst/include/dplyr/visitor_set/VisitorSetGreaterPredicate.h b/inst/include/dplyr/visitor_set/VisitorSetGreaterPredicate.h
deleted file mode 100644
index 235e968..0000000
--- a/inst/include/dplyr/visitor_set/VisitorSetGreaterPredicate.h
+++ /dev/null
@@ -1,20 +0,0 @@
-#ifndef dplyr_VisitorSetGreaterPredicate_H
-#define dplyr_VisitorSetGreaterPredicate_H
-
-namespace dplyr{
-
- template <typename VisitorSet>
- class VisitorSetGreaterPredicate {
- public:
- VisitorSetGreaterPredicate( const VisitorSet& visitors_ ) : visitors(visitors_) {} ;
- inline bool operator()(int i, int j) const {
- return visitors.greater(i,j) ;
- }
-
- private:
- const VisitorSet& visitors ;
- } ;
-
-}
-
-#endif
diff --git a/inst/include/dplyr/visitor_set/VisitorSetHash.h b/inst/include/dplyr/visitor_set/VisitorSetHash.h
index c79fee3..158427e 100644
--- a/inst/include/dplyr/visitor_set/VisitorSetHash.h
+++ b/inst/include/dplyr/visitor_set/VisitorSetHash.h
@@ -1,24 +1,26 @@
#ifndef dplyr_VisitorSetHash_H
#define dplyr_VisitorSetHash_H
-namespace dplyr{
+#include <tools/hash.h>
- template <typename Class>
- class VisitorSetHash {
- public:
- size_t hash( int j) const {
- const Class& obj = static_cast<const Class&>(*this) ;
- int n = obj.size() ;
- if( n == 0 ){
- stop("need at least one column for hash()") ;
- }
- size_t seed = obj.get(0)->hash(j) ;
- for( int k=1; k<n; k++){
- boost::hash_combine( seed, obj.get(k)->hash(j) ) ;
- }
- return seed ;
- }
- } ;
+namespace dplyr {
+
+template <typename Class>
+class VisitorSetHash {
+public:
+ size_t hash(int j) const {
+ const Class& obj = static_cast<const Class&>(*this);
+ int n = obj.size();
+ if (n == 0) {
+ stop("Need at least one column for `hash()`");
+ }
+ size_t seed = obj.get(0)->hash(j);
+ for (int k = 1; k < n; k++) {
+ boost::hash_combine(seed, obj.get(k)->hash(j));
+ }
+ return seed;
+ }
+};
}
diff --git a/inst/include/dplyr/visitor_set/VisitorSetHasher.h b/inst/include/dplyr/visitor_set/VisitorSetHasher.h
index 01df524..c80b3f6 100644
--- a/inst/include/dplyr/visitor_set/VisitorSetHasher.h
+++ b/inst/include/dplyr/visitor_set/VisitorSetHasher.h
@@ -1,21 +1,21 @@
#ifndef dplyr_VisitorSetHasher_H
#define dplyr_VisitorSetHasher_H
-namespace dplyr{
+namespace dplyr {
- template <typename VisitorSet>
- class VisitorSetHasher {
- public:
- VisitorSetHasher() : visitors(0){}
+template <typename VisitorSet>
+class VisitorSetHasher {
+public:
+ VisitorSetHasher() : visitors(0) {}
- VisitorSetHasher( VisitorSet* visitors_ ) : visitors(visitors_){} ;
- inline size_t operator()(int i) const {
- return visitors->hash(i) ;
- }
+ VisitorSetHasher(VisitorSet* visitors_) : visitors(visitors_) {};
+ inline size_t operator()(int i) const {
+ return visitors->hash(i);
+ }
- private:
- VisitorSet* visitors ;
- } ;
+private:
+ VisitorSet* visitors;
+};
}
diff --git a/inst/include/dplyr/visitor_set/VisitorSetIndexMap.h b/inst/include/dplyr/visitor_set/VisitorSetIndexMap.h
index 1c59509..324fd3d 100644
--- a/inst/include/dplyr/visitor_set/VisitorSetIndexMap.h
+++ b/inst/include/dplyr/visitor_set/VisitorSetIndexMap.h
@@ -1,32 +1,37 @@
#ifndef dplyr_VisitorSetIndexMap_H
#define dplyr_VisitorSetIndexMap_H
-namespace dplyr{
+#include <tools/hash.h>
- template <typename VisitorSet, typename VALUE>
- class VisitorSetIndexMap :
- public dplyr_hash_map<int, VALUE, VisitorSetHasher<VisitorSet> , VisitorSetEqualPredicate<VisitorSet> > {
- private:
- typedef VisitorSetHasher<VisitorSet> Hasher ;
- typedef VisitorSetEqualPredicate<VisitorSet> EqualPredicate ;
- typedef typename dplyr_hash_map<int, VALUE, Hasher, EqualPredicate> Base ;
+#include <dplyr/visitor_set/VisitorSetHasher.h>
+#include <dplyr/visitor_set/VisitorSetEqualPredicate.h>
- public:
- VisitorSetIndexMap() : Base(), visitors(0) {}
+namespace dplyr {
- VisitorSetIndexMap( VisitorSet& visitors_ ) :
- Base( 1024, Hasher(&visitors_), EqualPredicate(&visitors_) ),
- visitors(&visitors_)
- {}
+template <typename VisitorSet, typename VALUE>
+class VisitorSetIndexMap :
+ public dplyr_hash_map<int, VALUE, VisitorSetHasher<VisitorSet>, VisitorSetEqualPredicate<VisitorSet> > {
+private:
+ typedef VisitorSetHasher<VisitorSet> Hasher;
+ typedef VisitorSetEqualPredicate<VisitorSet> EqualPredicate;
+ typedef typename dplyr_hash_map<int, VALUE, Hasher, EqualPredicate> Base;
- VisitorSetIndexMap( VisitorSet* visitors_ ) :
- Base( 1024, Hasher(visitors_), EqualPredicate(visitors_) ),
- visitors(visitors_)
- {}
+public:
+ VisitorSetIndexMap() : Base(), visitors(0) {}
- VisitorSet* visitors ;
+ VisitorSetIndexMap(VisitorSet& visitors_) :
+ Base(1024, Hasher(&visitors_), EqualPredicate(&visitors_)),
+ visitors(&visitors_)
+ {}
- } ;
+ VisitorSetIndexMap(VisitorSet* visitors_) :
+ Base(1024, Hasher(visitors_), EqualPredicate(visitors_)),
+ visitors(visitors_)
+ {}
+
+ VisitorSet* visitors;
+
+};
}
diff --git a/inst/include/dplyr/visitor_set/VisitorSetIndexSet.h b/inst/include/dplyr/visitor_set/VisitorSetIndexSet.h
index 8be31b6..3b1e551 100644
--- a/inst/include/dplyr/visitor_set/VisitorSetIndexSet.h
+++ b/inst/include/dplyr/visitor_set/VisitorSetIndexSet.h
@@ -1,25 +1,30 @@
#ifndef dplyr_VisitorSetIndexSet_H
#define dplyr_VisitorSetIndexSet_H
-namespace dplyr{
+#include <tools/hash.h>
- template <typename VisitorSet>
- class VisitorSetIndexSet : public dplyr_hash_set<int, VisitorSetHasher<VisitorSet>, VisitorSetEqualPredicate<VisitorSet> > {
- private:
- typedef VisitorSetHasher<VisitorSet> Hasher ;
- typedef VisitorSetEqualPredicate<VisitorSet> EqualPredicate ;
- typedef dplyr_hash_set<int, Hasher, EqualPredicate> Base ;
+#include <dplyr/visitor_set/VisitorSetHasher.h>
+#include <dplyr/visitor_set/VisitorSetEqualPredicate.h>
- public:
- VisitorSetIndexSet() : Base(){}
+namespace dplyr {
- VisitorSetIndexSet( VisitorSet& visitors_ ) :
- Base( 1024, Hasher(&visitors_), EqualPredicate(&visitors_) )
- {}
- VisitorSetIndexSet( VisitorSet* visitors_ ) :
- Base( 1024, Hasher(visitors_), EqualPredicate(visitors_) )
- {}
- } ;
+template <typename VisitorSet>
+class VisitorSetIndexSet : public dplyr_hash_set<int, VisitorSetHasher<VisitorSet>, VisitorSetEqualPredicate<VisitorSet> > {
+private:
+ typedef VisitorSetHasher<VisitorSet> Hasher;
+ typedef VisitorSetEqualPredicate<VisitorSet> EqualPredicate;
+ typedef dplyr_hash_set<int, Hasher, EqualPredicate> Base;
+
+public:
+ VisitorSetIndexSet() : Base() {}
+
+ VisitorSetIndexSet(VisitorSet& visitors_) :
+ Base(1024, Hasher(&visitors_), EqualPredicate(&visitors_))
+ {}
+ VisitorSetIndexSet(VisitorSet* visitors_) :
+ Base(1024, Hasher(visitors_), EqualPredicate(visitors_))
+ {}
+};
}
#endif
diff --git a/inst/include/dplyr/visitor_set/VisitorSetLess.h b/inst/include/dplyr/visitor_set/VisitorSetLess.h
index 7741f1f..b1af8ec 100644
--- a/inst/include/dplyr/visitor_set/VisitorSetLess.h
+++ b/inst/include/dplyr/visitor_set/VisitorSetLess.h
@@ -1,26 +1,26 @@
#ifndef dplyr_VisitorSetLess_H
#define dplyr_VisitorSetLess_H
-namespace dplyr{
+namespace dplyr {
template <typename Class>
class VisitorSetLess {
public:
- bool less( int i, int j) const {
- if( i == j ) return false ;
- const Class& obj = static_cast<const Class&>(*this) ;
- int n=obj.size();
- for( int k=0; k<n; k++){
- typename Class::visitor_type* visitor = obj.get(k) ;
- if( ! visitor->equal(i,j) ){
- return visitor->less(i,j) ;
- }
- }
- // if we end up here, it means rows i and j are equal
- // we break the tie using the indices
- return i < j ;
+ bool less(int i, int j) const {
+ if (i == j) return false;
+ const Class& obj = static_cast<const Class&>(*this);
+ int n = obj.size();
+ for (int k = 0; k < n; k++) {
+ typename Class::visitor_type* visitor = obj.get(k);
+ if (! visitor->equal(i, j)) {
+ return visitor->less(i, j);
+ }
}
-} ;
+ // if we end up here, it means rows i and j are equal
+ // we break the tie using the indices
+ return i < j;
+ }
+};
}
diff --git a/inst/include/dplyr/visitor_set/VisitorSetLessPredicate.h b/inst/include/dplyr/visitor_set/VisitorSetLessPredicate.h
deleted file mode 100644
index a0033bb..0000000
--- a/inst/include/dplyr/visitor_set/VisitorSetLessPredicate.h
+++ /dev/null
@@ -1,20 +0,0 @@
-#ifndef dplyr_VisitorSetLessPredicate_H
-#define dplyr_VisitorSetLessPredicate_H
-
-namespace dplyr{
-
- template <typename VisitorSet>
- class VisitorSetLessPredicate {
- public:
- VisitorSetLessPredicate( const VisitorSet& visitors_ ) : visitors(visitors_) {} ;
- inline bool operator()(int i, int j) const {
- return visitors.less(i,j) ;
- }
-
- private:
- const VisitorSet& visitors ;
- } ;
-
-}
-
-#endif
diff --git a/inst/include/dplyr/visitor_set/VisitorSetMixin.h b/inst/include/dplyr/visitor_set/VisitorSetMixin.h
new file mode 100644
index 0000000..902a38c
--- /dev/null
+++ b/inst/include/dplyr/visitor_set/VisitorSetMixin.h
@@ -0,0 +1,9 @@
+#ifndef dplyr_dplyr_visitor_set_VisitorSetMixin_H
+#define dplyr_dplyr_visitor_set_VisitorSetMixin_H
+
+#include <dplyr/visitor_set/VisitorSetEqual.h>
+#include <dplyr/visitor_set/VisitorSetHash.h>
+#include <dplyr/visitor_set/VisitorSetLess.h>
+#include <dplyr/visitor_set/VisitorSetGreater.h>
+
+#endif // #ifndef dplyr_dplyr_visitor_set_VisitorSetMixin_H
diff --git a/inst/include/dplyr/visitor_set/visitor_set.h b/inst/include/dplyr/visitor_set/visitor_set.h
index 4b6e0fe..de64e1c 100644
--- a/inst/include/dplyr/visitor_set/visitor_set.h
+++ b/inst/include/dplyr/visitor_set/visitor_set.h
@@ -1,19 +1,6 @@
#ifndef dplyr_visitor_set_H
#define dplyr_visitor_set_H
-#include <dplyr/visitor_set/VisitorEqualPredicate.h>
-#include <dplyr/visitor_set/VisitorHash.h>
-
-#include <dplyr/visitor_set/VisitorSetEqual.h>
-#include <dplyr/visitor_set/VisitorSetHash.h>
-#include <dplyr/visitor_set/VisitorSetLess.h>
-#include <dplyr/visitor_set/VisitorSetGreater.h>
-
-#include <dplyr/visitor_set/VisitorSetHasher.h>
-#include <dplyr/visitor_set/VisitorSetEqualPredicate.h>
-#include <dplyr/visitor_set/VisitorSetLessPredicate.h>
-#include <dplyr/visitor_set/VisitorSetGreaterPredicate.h>
-
#include <dplyr/visitor_set/VisitorSetIndexSet.h>
#include <dplyr/visitor_set/VisitorSetIndexMap.h>
diff --git a/inst/include/dplyr/white_list.h b/inst/include/dplyr/white_list.h
index 17c5604..808661a 100644
--- a/inst/include/dplyr/white_list.h
+++ b/inst/include/dplyr/white_list.h
@@ -1,28 +1,34 @@
#ifndef dplyr_white_list_H
#define dplyr_white_list_H
-namespace dplyr{
+namespace dplyr {
- inline bool white_list(SEXP x){
- if( Rf_isMatrix(x) ) {
- // might have to refine later
- return true ;
- }
- switch( TYPEOF(x) ){
- case INTSXP: return true ;
- case REALSXP: return true ;
- case LGLSXP: return true ;
- case STRSXP: return true ;
- case CPLXSXP: return true ;
- case VECSXP: {
- if( Rf_inherits( x, "POSIXlt") ) return false ;
- return true ;
- }
+inline bool white_list(SEXP x) {
+ if (Rf_isMatrix(x)) {
+ // might have to refine later
+ return true;
+ }
+ switch (TYPEOF(x)) {
+ case INTSXP:
+ return true;
+ case REALSXP:
+ return true;
+ case LGLSXP:
+ return true;
+ case STRSXP:
+ return true;
+ case CPLXSXP:
+ return true;
+ case VECSXP: {
+ if (Rf_inherits(x, "POSIXlt")) return false;
+ return true;
+ }
- default: break ;
- }
- return false ;
- }
+ default:
+ break;
+ }
+ return false;
+}
}
#endif
diff --git a/inst/include/dplyr/workarounds.h b/inst/include/dplyr/workarounds.h
index 71b231b..4d2d404 100644
--- a/inst/include/dplyr/workarounds.h
+++ b/inst/include/dplyr/workarounds.h
@@ -3,8 +3,8 @@
// installChar was introduced in R 3.2.0
#ifndef installChar
- #define installChar(x) Rf_install(CHAR(x))
- #define Rf_installChar installChar
+#define installChar(x) Rf_install(CHAR(x))
+#define Rf_installChar installChar
#endif
#endif
diff --git a/inst/include/dplyr/workarounds/static_assert.h b/inst/include/dplyr/workarounds/static_assert.h
index 9fc25b5..f5b8b25 100644
--- a/inst/include/dplyr/workarounds/static_assert.h
+++ b/inst/include/dplyr/workarounds/static_assert.h
@@ -81,7 +81,7 @@
# endif
#else
-namespace boost{
+namespace boost {
// HP aCC cannot deal with missing names for template value parameters
template <bool x> struct STATIC_ASSERTION_FAILURE;
@@ -89,7 +89,7 @@ template <bool x> struct STATIC_ASSERTION_FAILURE;
template <> struct STATIC_ASSERTION_FAILURE<true> { enum { value = 1 }; };
// HP aCC cannot deal with missing names for template value parameters
-template<int x> struct static_assert_test{};
+template<int x> struct static_assert_test {};
}
diff --git a/inst/include/dplyr/workarounds/xlen.h b/inst/include/dplyr/workarounds/xlen.h
new file mode 100644
index 0000000..466b66b
--- /dev/null
+++ b/inst/include/dplyr/workarounds/xlen.h
@@ -0,0 +1,22 @@
+#ifndef DPLYR_WORKAROUND_XLEN_H
+#define DPLYR_WORKAROUND_XLEN_H
+
+#ifdef LONG_VECTOR_SUPPORT
+
+namespace Rcpp {
+
+template <>
+inline SEXP wrap(const R_xlen_t& x) {
+ if (x < -R_SHORT_LEN_MAX || x > R_SHORT_LEN_MAX) {
+ return Rf_ScalarReal(static_cast<double>(x));
+ }
+ else {
+ return Rf_ScalarInteger(static_cast<int>(x));
+ }
+}
+
+}
+
+#endif
+
+#endif
diff --git a/inst/include/dplyr_RcppExports.h b/inst/include/dplyr_RcppExports.h
new file mode 100644
index 0000000..f35267c
--- /dev/null
+++ b/inst/include/dplyr_RcppExports.h
@@ -0,0 +1,87 @@
+// Generated by using Rcpp::compileAttributes() -> do not edit by hand
+// Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393
+
+#ifndef RCPP_dplyr_RCPPEXPORTS_H_GEN_
+#define RCPP_dplyr_RCPPEXPORTS_H_GEN_
+
+#include "dplyr_types.h"
+#include <Rcpp.h>
+
+namespace dplyr {
+
+ using namespace Rcpp;
+
+ namespace {
+ void validateSignature(const char* sig) {
+ Rcpp::Function require = Rcpp::Environment::base_env()["require"];
+ require("dplyr", Rcpp::Named("quietly") = true);
+ typedef int(*Ptr_validate)(const char*);
+ static Ptr_validate p_validate = (Ptr_validate)
+ R_GetCCallable("dplyr", "_dplyr_RcppExport_validate");
+ if (!p_validate(sig)) {
+ throw Rcpp::function_not_exported(
+ "C++ function with signature '" + std::string(sig) + "' not found in dplyr");
+ }
+ }
+ }
+
+ inline SEXP get_date_classes() {
+ typedef SEXP(*Ptr_get_date_classes)();
+ static Ptr_get_date_classes p_get_date_classes = NULL;
+ if (p_get_date_classes == NULL) {
+ validateSignature("SEXP(*get_date_classes)()");
+ p_get_date_classes = (Ptr_get_date_classes)R_GetCCallable("dplyr", "_dplyr_get_date_classes");
+ }
+ RObject rcpp_result_gen;
+ {
+ RNGScope RCPP_rngScope_gen;
+ rcpp_result_gen = p_get_date_classes();
+ }
+ if (rcpp_result_gen.inherits("interrupted-error"))
+ throw Rcpp::internal::InterruptedException();
+ if (rcpp_result_gen.inherits("try-error"))
+ throw Rcpp::exception(as<std::string>(rcpp_result_gen).c_str());
+ return Rcpp::as<SEXP >(rcpp_result_gen);
+ }
+
+ inline SEXP get_time_classes() {
+ typedef SEXP(*Ptr_get_time_classes)();
+ static Ptr_get_time_classes p_get_time_classes = NULL;
+ if (p_get_time_classes == NULL) {
+ validateSignature("SEXP(*get_time_classes)()");
+ p_get_time_classes = (Ptr_get_time_classes)R_GetCCallable("dplyr", "_dplyr_get_time_classes");
+ }
+ RObject rcpp_result_gen;
+ {
+ RNGScope RCPP_rngScope_gen;
+ rcpp_result_gen = p_get_time_classes();
+ }
+ if (rcpp_result_gen.inherits("interrupted-error"))
+ throw Rcpp::internal::InterruptedException();
+ if (rcpp_result_gen.inherits("try-error"))
+ throw Rcpp::exception(as<std::string>(rcpp_result_gen).c_str());
+ return Rcpp::as<SEXP >(rcpp_result_gen);
+ }
+
+ inline DataFrame build_index_cpp(DataFrame data) {
+ typedef SEXP(*Ptr_build_index_cpp)(SEXP);
+ static Ptr_build_index_cpp p_build_index_cpp = NULL;
+ if (p_build_index_cpp == NULL) {
+ validateSignature("DataFrame(*build_index_cpp)(DataFrame)");
+ p_build_index_cpp = (Ptr_build_index_cpp)R_GetCCallable("dplyr", "_dplyr_build_index_cpp");
+ }
+ RObject rcpp_result_gen;
+ {
+ RNGScope RCPP_rngScope_gen;
+ rcpp_result_gen = p_build_index_cpp(Shield<SEXP>(Rcpp::wrap(data)));
+ }
+ if (rcpp_result_gen.inherits("interrupted-error"))
+ throw Rcpp::internal::InterruptedException();
+ if (rcpp_result_gen.inherits("try-error"))
+ throw Rcpp::exception(as<std::string>(rcpp_result_gen).c_str());
+ return Rcpp::as<DataFrame >(rcpp_result_gen);
+ }
+
+}
+
+#endif // RCPP_dplyr_RCPPEXPORTS_H_GEN_
diff --git a/inst/include/dplyr_types.h b/inst/include/dplyr_types.h
new file mode 100644
index 0000000..f419612
--- /dev/null
+++ b/inst/include/dplyr_types.h
@@ -0,0 +1,8 @@
+#include <dplyr/main.h>
+#include <tools/Quosure.h>
+#include <dplyr/registration.h>
+#include <dplyr/BoolResult.h>
+#include <dplyr/GroupedDataFrame.h>
+
+// avoid inclusion of package header file
+#define dplyr_dplyr_H
diff --git a/inst/include/solaris/solaris.h b/inst/include/solaris/solaris.h
index e6d1146..ed9cc13 100644
--- a/inst/include/solaris/solaris.h
+++ b/inst/include/solaris/solaris.h
@@ -3,17 +3,17 @@
#if defined(__SUNPRO_CC) && !defined(Rcpp__platform__solaris_h)
-namespace Rcpp{
-namespace traits{
+namespace Rcpp {
+namespace traits {
- template <typename T> struct is_convertible< std::vector<T>, SEXP> : public false_type{} ;
- template <> struct is_convertible<Range,SEXP> : public false_type{} ;
+template <typename T> struct is_convertible< std::vector<T>, SEXP> : public false_type {};
+template <> struct is_convertible<Range, SEXP> : public false_type {};
- template <int RTYPE, bool NA>
- struct is_convertible< sugar::Minus_Vector_Primitive< RTYPE, NA, Vector<RTYPE> >, SEXP> : public false_type{} ;
+template <int RTYPE, bool NA>
+struct is_convertible< sugar::Minus_Vector_Primitive< RTYPE, NA, Vector<RTYPE> >, SEXP> : public false_type {};
- template <int RTYPE, bool NA>
- struct is_convertible< sugar::Plus_Vector_Primitive< RTYPE, NA, Vector<RTYPE> >, SEXP> : public false_type{} ;
+template <int RTYPE, bool NA>
+struct is_convertible< sugar::Plus_Vector_Primitive< RTYPE, NA, Vector<RTYPE> >, SEXP> : public false_type {};
}
}
diff --git a/inst/include/tools/Call.h b/inst/include/tools/Call.h
index cf274f8..9a57a4d 100644
--- a/inst/include/tools/Call.h
+++ b/inst/include/tools/Call.h
@@ -3,50 +3,50 @@
namespace Rcpp {
- class Call {
- public:
-
- Call() : data(R_NilValue){}
-
- Call(SEXP x) : data(x){
- if( data != R_NilValue ) R_PreserveObject(data) ;
- }
-
- ~Call(){
- if( data != R_NilValue ) R_ReleaseObject(data) ;
- }
-
- Call( const Call& other ) : data(other.data){
- if( data != R_NilValue ) R_PreserveObject(data) ;
- }
-
- Call& operator=( SEXP other ){
- if( other != data ){
- if( data != R_NilValue ) R_ReleaseObject(data) ;
- data = other ;
- if( data != R_NilValue ) R_PreserveObject(data) ;
- }
- return *this ;
- }
-
- inline SEXP eval(SEXP env) const {
- return Rcpp_eval(data, env) ;
- }
-
- inline operator SEXP() const{
- return data ;
- }
-
- private:
- SEXP data ;
-
- Call& operator=( const Call& other) ;
- // {
- // *this = other.data ;
- // return *this ;
- // }
-
- } ;
+class Call {
+public:
+
+ Call() : data(R_NilValue) {}
+
+ Call(SEXP x) : data(x) {
+ if (data != R_NilValue) R_PreserveObject(data);
+ }
+
+ ~Call() {
+ if (data != R_NilValue) R_ReleaseObject(data);
+ }
+
+ Call(const Call& other) : data(other.data) {
+ if (data != R_NilValue) R_PreserveObject(data);
+ }
+
+ Call& operator=(SEXP other) {
+ if (other != data) {
+ if (data != R_NilValue) R_ReleaseObject(data);
+ data = other;
+ if (data != R_NilValue) R_PreserveObject(data);
+ }
+ return *this;
+ }
+
+ inline SEXP eval(SEXP env) const {
+ return Rcpp_eval(data, env);
+ }
+
+ inline operator SEXP() const {
+ return data;
+ }
+
+private:
+ SEXP data;
+
+ Call& operator=(const Call& other);
+ // {
+ // *this = other.data;
+ // return *this;
+ // }
+
+};
}
diff --git a/inst/include/tools/DotsOf.h b/inst/include/tools/DotsOf.h
deleted file mode 100644
index 366dc12..0000000
--- a/inst/include/tools/DotsOf.h
+++ /dev/null
@@ -1,105 +0,0 @@
-#ifndef dplyr__DotsOf_h
-#define dplyr__DotsOf_h
-
-namespace Rcpp {
-
- template <typename T>
- class DotsOf {
- public:
-
- DotsOf( Environment env ) : data() {
- SEXP dots = env.find( "..." );
- if( dots != R_MissingArg ) collect(dots) ;
- }
-
- inline T& operator[](int i){
- return data[i] ;
- }
-
- inline int size() const{
- return data.size();
- }
-
- private:
- std::vector<T> data ;
-
- void collect( SEXP dots){
- int np = 0 ;
-
- while( dots != R_NilValue ){
- SEXP prom = CAR(dots) ;
-
- while(true){
- SEXP code = PRCODE(prom) ;
- if( TYPEOF(code) != PROMSXP ){
- break ;
- }
- prom = code ;
- }
- SEXP x = PRVALUE(prom) ;
- if( x == R_UnboundValue ){
- x = PROTECT(Rf_eval(PRCODE(prom), PRENV(prom))) ;
- np++ ;
- }
- if( is<T>(x) ){
- data.push_back( as<T>(x) ) ;
- }
- dots = CDR(dots) ;
- }
- if(np) UNPROTECT(np) ;
- }
-
- } ;
-
- class Dots {
- public:
-
- Dots( Environment env ) : data() {
- SEXP dots = env.find( "..." );
- if( dots != R_MissingArg ) collect(dots) ;
- }
-
- inline SEXP operator[](int i){
- return data[i] ;
- }
-
- inline int size() const{
- return data.size();
- }
-
- inline int names() const{
- return data.names();
- }
-
- private:
- List data ;
-
- void collect( SEXP dots){
- int np = 0 ;
-
- while( dots != R_NilValue ){
- SEXP prom = CAR(dots) ;
-
- while(true){
- SEXP code = PRCODE(prom) ;
- if( TYPEOF(code) != PROMSXP ){
- break ;
- }
- prom = code ;
- }
- SEXP x = PRVALUE(prom) ;
- if( x == R_UnboundValue ){
- x = PROTECT(Rf_eval(PRCODE(prom), PRENV(prom))) ;
- np++ ;
- }
- data.push_back(x) ;
- dots = CDR(dots) ;
- }
- if(np) UNPROTECT(np) ;
- }
-
- } ;
-
-
-}
-#endif
diff --git a/inst/include/tools/Encoding.h b/inst/include/tools/Encoding.h
deleted file mode 100644
index 5bfa775..0000000
--- a/inst/include/tools/Encoding.h
+++ /dev/null
@@ -1,74 +0,0 @@
-#ifndef DPLYR_ENCODING_H
-#define DPLYR_ENCODING_H
-
-#define TYPE_BITS 5
-#define BYTES_MASK (1<<1)
-#define LATIN1_MASK (1<<2)
-#define UTF8_MASK (1<<3)
-
-// that bit seems unused by R. Just using it to mark
-// objects as Shrinkable Vectors
-// that is useful for things like summarise(list(x)) where x is a
-// variable from the data, because the SEXP that goes into the list
-// is the shrinkable vector, we use this information to duplicate
-// it if needed. See the maybe_copy method in DelayedProcessor
-#define DPLYR_SHRINKABLE_MASK (1<<8)
-
-struct sxpinfo_struct {
- SEXPTYPE type : TYPE_BITS;/* ==> (FUNSXP == 99) %% 2^5 == 3 == CLOSXP
- * -> warning: `type' is narrower than values
- * of its type
- * when SEXPTYPE was an enum */
- unsigned int obj : 1;
- unsigned int named : 2;
- unsigned int gp : 16;
- unsigned int mark : 1;
- unsigned int debug : 1;
- unsigned int trace : 1; /* functions and memory tracing */
- unsigned int spare : 1; /* currently unused */
- unsigned int gcgen : 1; /* old generation number */
- unsigned int gccls : 3; /* node class */
-}; /* Tot: 32 */
-
-#ifndef IS_BYTES
-# define IS_BYTES(x) (reinterpret_cast<sxpinfo_struct*>(x)->gp & BYTES_MASK)
-#endif
-
-#ifndef IS_LATIN1
-# define IS_LATIN1(x) (reinterpret_cast<sxpinfo_struct*>(x)->gp & LATIN1_MASK)
-#endif
-
-#ifndef IS_ASCII
-# define IS_ASCII(x) (reinterpret_cast<sxpinfo_struct*>(x)->gp & ASCII_MASK)
-#endif
-
-#ifndef IS_UTF8
-# define IS_UTF8(x) (reinterpret_cast<sxpinfo_struct*>(x)->gp & UTF8_MASK)
-#endif
-
-namespace dplyr{
-
- enum encoding {
- BYTES, LATIN1, UTF8, UNKNOWN
- } ;
-
- inline encoding get_encoding( SEXP s){
- if( IS_BYTES(s) ) return BYTES ;
- if( IS_LATIN1(s) ) return LATIN1 ;
- if( IS_UTF8(s) ) return UTF8 ;
- return UNKNOWN ;
- }
-
- inline const char* human_readable_encoding( encoding e ){
- switch(e){
- case BYTES: return "bytes" ;
- case LATIN1: return "latin1" ;
- case UTF8: return "UTF-8" ;
- default: break ;
- }
- return "unknown" ;
- }
-
-}
-
-#endif
diff --git a/inst/include/tools/FilteredListOf.h b/inst/include/tools/FilteredListOf.h
deleted file mode 100644
index 2d71895..0000000
--- a/inst/include/tools/FilteredListOf.h
+++ /dev/null
@@ -1,31 +0,0 @@
-#ifndef dplyr_tools_FilteredListOf_H
-#define dplyr_tools_FilteredListOf_H
-
-namespace Rcpp {
-
- template <typename T>
- class FilteredListOf {
- public:
-
- FilteredListOf(SEXP data_) : data(data_){
- int n = data.size() ;
- for( int i=0; i<n; i++){
- indices.push_back(i) ;
- }
- }
-
- T operator[](int i) const {
- return as<T>( data[indices[i]]) ;
- }
-
- int size() const {
- return indices.size() ;
- }
-
- private:
- List data ;
- std::vector<int> indices ;
- } ;
-}
-
-#endif
diff --git a/inst/include/tools/LazyDots.h b/inst/include/tools/LazyDots.h
deleted file mode 100644
index b9287ae..0000000
--- a/inst/include/tools/LazyDots.h
+++ /dev/null
@@ -1,81 +0,0 @@
-#ifndef dplyr__Lazy_h
-#define dplyr__Lazy_h
-
-namespace Rcpp {
-
- class Lazy {
- public:
- Lazy( List data_, SEXP name__ ) :
- data(data_),
- name_(name__)
- {}
-
- Lazy( const Lazy& other ) :
- data(other.data),
- name_(other.name_)
- {}
-
- inline SEXP expr() const {
- return Rf_duplicate(data[0]) ;
- }
- inline SEXP env() const {
- return data[1];
- }
- inline SEXP name() const {
- return name_ ;
- }
-
- private:
-
- List data ;
- SEXP name_ ;
- } ;
-
- template <>
- inline bool is<Lazy>(SEXP x){
- return TYPEOF(x) == VECSXP &&
- Rf_length(x) == 2 &&
- Rf_inherits(x, "lazy") &&
- TYPEOF(VECTOR_ELT(x,1)) == ENVSXP
- ;
- }
-
- class LazyDots {
- public:
- LazyDots( List data_ ) : data(){
- int n = data_.size() ;
- if (n == 0) return;
-
- CharacterVector names = data_.names() ;
- for(int i=0; i<n; i++){
- List x = data_[i] ;
- if( !is<Lazy>(x) ){
- stop( "corrupt lazy object" );
- }
- data.push_back(Lazy(x, names[i])) ;
- }
- }
-
- inline const Lazy& operator[](int i) const {
- return data[i] ;
- }
-
- inline int size() const {
- return data.size() ;
- }
-
- inline bool single_env() const {
- if( data.size() <= 1 ) return true ;
- SEXP env = data[0].env() ;
- for( size_t i=1; i<data.size(); i++){
- if( data[i].env() != env ) return false ;
- }
- return true ;
- }
-
- private:
- std::vector<Lazy> data ;
- } ;
-
-}
-#endif
diff --git a/inst/include/tools/Quosure.h b/inst/include/tools/Quosure.h
new file mode 100644
index 0000000..7fd962e
--- /dev/null
+++ b/inst/include/tools/Quosure.h
@@ -0,0 +1,125 @@
+#ifndef dplyr__Quosure_h
+#define dplyr__Quosure_h
+
+#include <tools/SymbolString.h>
+#include "SymbolVector.h"
+
+
+namespace dplyr {
+
+inline SEXP quosure(SEXP expr, SEXP env) {
+ Language quo("~", expr);
+ quo.attr(".Environment") = env;
+ quo.attr("class") = CharacterVector("formula");
+ return quo;
+}
+
+
+class NamedQuosure {
+public:
+ NamedQuosure(SEXP data_, SymbolString name__ = "") :
+ data(data_),
+ name_(name__)
+ {}
+ NamedQuosure(const Formula& data_, SymbolString name__ = "") :
+ data(data_),
+ name_(name__)
+ {}
+ NamedQuosure(const NamedQuosure& other) :
+ data(other.data),
+ name_(other.name_)
+ {}
+
+ SEXP expr() const {
+ return Rf_duplicate(CADR(data));
+ }
+ SEXP env() const {
+ static SEXP sym_dotenv = Rf_install(".Environment");
+ return Rf_getAttrib(data, sym_dotenv);
+ }
+ SymbolString name() const {
+ return name_;
+ }
+
+private:
+ Formula data;
+ SymbolString name_;
+};
+
+} // namespace dplyr
+
+
+namespace Rcpp {
+
+using namespace dplyr;
+
+template <>
+inline bool is<NamedQuosure>(SEXP x) {
+ bool is_tilde =
+ TYPEOF(x) == LANGSXP &&
+ Rf_length(x) == 2 &&
+ CAR(x) == Rf_install("~");
+
+ SEXP env = Rf_getAttrib(x, Rf_install(".Environment"));
+ bool has_env = TYPEOF(env) == ENVSXP;
+
+ return is_tilde && has_env;
+}
+
+} // namespace Rcpp
+
+
+namespace dplyr {
+
+class QuosureList {
+public:
+ QuosureList(const List& data_) : data() {
+ int n = data_.size();
+ if (n == 0) return;
+
+ CharacterVector names = data_.names();
+ for (int i = 0; i < n; i++) {
+ SEXP x = data_[i];
+
+ if (!is<NamedQuosure>(x)) {
+ stop("corrupt tidy quote");
+ }
+
+ data.push_back(NamedQuosure(x, SymbolString(names[i])));
+ }
+ }
+
+ const NamedQuosure& operator[](int i) const {
+ return data[i];
+ }
+
+ int size() const {
+ return data.size();
+ }
+
+ bool single_env() const {
+ if (data.size() <= 1) return true;
+ SEXP env = data[0].env();
+ for (size_t i = 1; i < data.size(); i++) {
+ if (data[i].env() != env) return false;
+ }
+ return true;
+ }
+
+ SymbolVector names() const {
+ CharacterVector out(data.size());
+
+ for (size_t i = 0; i < data.size(); ++i) {
+ out[i] = data[i].name().get_string();
+ }
+
+ return SymbolVector(out);
+ }
+
+private:
+ std::vector<NamedQuosure> data;
+};
+
+} // namespace dplyr
+
+#endif
diff --git a/inst/include/tools/ShrinkableVector.h b/inst/include/tools/ShrinkableVector.h
index 5f897db..9bdcbe8 100644
--- a/inst/include/tools/ShrinkableVector.h
+++ b/inst/include/tools/ShrinkableVector.h
@@ -1,57 +1,56 @@
#ifndef dplyr_ShrinkableVector_H
#define dplyr_ShrinkableVector_H
-#define IS_DPLYR_SHRINKABLE_VECTOR(x) (reinterpret_cast<sxpinfo_struct*>(x)->gp & DPLYR_SHRINKABLE_MASK)
-#define SET_DPLYR_SHRINKABLE_VECTOR(x) (reinterpret_cast<sxpinfo_struct*>(x)->gp |= DPLYR_SHRINKABLE_MASK)
-#define UNSET_DPLYR_SHRINKABLE_VECTOR(x) (reinterpret_cast<sxpinfo_struct*>(x)->gp &= (~DPLYR_SHRINKABLE_MASK) )
+#include <tools/encoding.h>
+#include <tools/utils.h>
namespace Rcpp {
- template <int RTYPE>
- class ShrinkableVector {
- public:
- typedef typename traits::storage_type<RTYPE>::type STORAGE ;
-
- ShrinkableVector( int n, SEXP origin ) :
- data( no_init(n) ), max_size(n), start( internal::r_vector_start<RTYPE>(data) ), gp(LEVELS(data))
- {
- copy_most_attributes(data, origin) ;
- SET_DPLYR_SHRINKABLE_VECTOR( (SEXP)data ) ;
- }
-
- inline void resize( int n){
- SETLENGTH( data, n ) ;
- }
-
- inline operator SEXP() const {
- return data ;
- }
-
- inline void borrow(const SlicingIndex& indices, STORAGE* begin){
- int n = indices.size() ;
- for( int i=0; i<n ; i++){
- start[i] = begin[indices[i]] ;
- }
- SETLENGTH(data, n) ;
- }
-
- ~ShrinkableVector(){
- // restore the initial length so that R can reclaim the memory
- SETLENGTH( data, max_size );
- UNSET_DPLYR_SHRINKABLE_VECTOR((SEXP)data);
- }
-
- private:
- Rcpp::Vector<RTYPE> data ;
- int max_size ;
- STORAGE* start ;
- unsigned short gp ;
-
- } ;
-
- inline bool is_ShrinkableVector( SEXP x){
- return IS_DPLYR_SHRINKABLE_VECTOR(x) ;
+template <int RTYPE>
+class ShrinkableVector {
+public:
+ typedef typename Rcpp::traits::storage_type<RTYPE>::type STORAGE;
+
+ ShrinkableVector(int n, SEXP origin) :
+ data(no_init(n)), max_size(n), start(internal::r_vector_start<RTYPE>(data)), gp(LEVELS(data))
+ {
+ copy_most_attributes(data, origin);
+ SET_DPLYR_SHRINKABLE_VECTOR((SEXP)data);
+ }
+
+ inline void resize(int n) {
+ SETLENGTH(data, n);
+ }
+
+ inline operator SEXP() const {
+ return data;
+ }
+
+ inline void borrow(const SlicingIndex& indices, STORAGE* begin) {
+ int n = indices.size();
+ for (int i = 0; i < n; i++) {
+ start[i] = begin[indices[i]];
}
+ SETLENGTH(data, n);
+ }
+
+ ~ShrinkableVector() {
+ // restore the initial length so that R can reclaim the memory
+ SETLENGTH(data, max_size);
+ UNSET_DPLYR_SHRINKABLE_VECTOR((SEXP)data);
+ }
+
+private:
+ Rcpp::Vector<RTYPE> data;
+ int max_size;
+ STORAGE* start;
+ unsigned short gp;
+
+};
+
+inline bool is_ShrinkableVector(SEXP x) {
+ return IS_DPLYR_SHRINKABLE_VECTOR(x);
+}
}
diff --git a/inst/include/tools/SlicingIndex.h b/inst/include/tools/SlicingIndex.h
index 6208183..020a1f8 100644
--- a/inst/include/tools/SlicingIndex.h
+++ b/inst/include/tools/SlicingIndex.h
@@ -1,31 +1,121 @@
#ifndef dplyr_tools_SlicingIndex_H
#define dplyr_tools_SlicingIndex_H
+// A SlicingIndex allows specifying which rows of a data frame are selected in which order, basically a 0:n -> 0:m map.
+// It also can be used to split a data frame in groups.
+// Important special cases can be implemented without materializing the map.
class SlicingIndex {
public:
+ virtual int size() const = 0;
+ virtual int operator[](int i) const = 0;
+ virtual int group() const = 0;
+ virtual bool is_identity(SEXP) const {
+ return FALSE;
+ };
+};
- SlicingIndex(IntegerVector data_) : data(data_), group_index(-1) {}
- SlicingIndex(IntegerVector data_, int group_) : data(data_), group_index(group_) {}
+// A GroupedSlicingIndex is the most general slicing index,
+// the 0:n -> 0:m map is specified and stored as an IntegerVector.
+// A group identifier can be assigned on construction.
+// It is used in grouped operations (group_by()).
+class GroupedSlicingIndex : public SlicingIndex {
+public:
+ GroupedSlicingIndex(IntegerVector data_) : data(data_), group_index(-1) {}
+ GroupedSlicingIndex(IntegerVector data_, int group_) : data(data_), group_index(group_) {}
+
+ virtual int size() const {
+ return data.size();
+ }
+
+ virtual int operator[](int i) const {
+ return data[i];
+ }
+
+ virtual int group() const {
+ return group_index;
+ }
+
+private:
+ IntegerVector data;
+ int group_index;
+};
+
+// A RowwiseSlicingIndex selects a single row, which is also the group ID by definition.
+// It is used in rowwise operations (rowwise()).
+class RowwiseSlicingIndex : public SlicingIndex {
+public:
+ RowwiseSlicingIndex(const int start_) : start(start_) {}
+
+ inline int size() const {
+ return 1;
+ }
+
+ inline int operator[](int i) const {
+ if (i != 0)
+ stop("Can only use 0 for RowwiseSlicingIndex, queried %d", i);
+ return start;
+ }
+
+ inline int group() const {
+ return start;
+ }
- SlicingIndex(int start, int n) : data(0), group_index(-1) {
- if(n>0) {
- data = seq(start, start + n - 1 ) ;
- }
- }
+private:
+ int start;
+};
+
+// A NaturalSlicingIndex selects an entire data frame as a single group.
+// It is used when the entire data frame needs to be processed by a processor that expects a SlicingIndex
+// to address the rows.
+class NaturalSlicingIndex : public SlicingIndex {
+public:
+ NaturalSlicingIndex(const int n_) : n(n_) {}
+
+ virtual int size() const {
+ return n;
+ }
+
+ virtual int operator[](int i) const {
+ if (i < 0 || i >= n)
+ stop("Out of bounds index %d queried for NaturalSlicingIndex", i);
+ return i;
+ }
+
+ virtual int group() const {
+ return -1;
+ }
+
+ virtual bool is_identity(SEXP x) const {
+ const R_len_t length = Rf_length(x);
+ return length == n;
+ }
+
+private:
+ int n;
+};
+
+// An OffsetSlicingIndex selects a consecutive part of a data frame, starting at a specific row.
+// It is used for binding data frames vertically (bind_rows()).
+class OffsetSlicingIndex : public SlicingIndex {
+public:
+ OffsetSlicingIndex(const int start_, const int n_) : start(start_), n(n_) {}
- inline int size() const {
- return data.size() ;
- }
+ inline int size() const {
+ return n;
+ }
- inline int operator[](int i) const {
- return data[i] ;
- }
+ inline int operator[](int i) const {
+ if (i < 0 || i >= n)
+ stop("Out of bounds index %d queried for OffsetSlicingIndex", i);
+ return i + start;
+ }
- inline int group() const { return group_index ; }
+ inline int group() const {
+ return -1;
+ }
-// private:
- IntegerVector data ;
- int group_index ;
-} ;
+private:
+ int start, n;
+};
#endif
diff --git a/inst/include/tools/StringUTF8.h b/inst/include/tools/StringUTF8.h
deleted file mode 100644
index e8c2e83..0000000
--- a/inst/include/tools/StringUTF8.h
+++ /dev/null
@@ -1,58 +0,0 @@
-#ifndef DPLYR_STRINGUTF8_H
-#define DPLYR_STRINGUTF8_H
-
-namespace dplyr {
-
- class StringUtf8 {
- public:
- StringUtf8( SEXP s_ ) : s(Rf_mkCharCE(Rf_translateCharUTF8(s_), CE_UTF8)) {}
- StringUtf8( CharacterVector::Proxy s_ ) : s(Rf_mkCharCE(Rf_translateCharUTF8(s_), CE_UTF8)) {}
-
- inline operator SEXP() const {
- return s ;
- }
-
- private:
- SEXP s ;
- } ;
-
- class CharacterVectorUtf8 {
- public:
- CharacterVectorUtf8( SEXP v ) : data(v){
- int n = data.size() ;
- // move on to the first non UTF-8 string, if any
-
- int i = 0 ;
- for( ; i<n; i++){
- cetype_t enc = Rf_getCharCE(data[i]) ;
- if( enc != CE_UTF8 ) break ;
- }
- if( i < n ){
- CharacterVector newdata(n) ;
- for( int j=0; j<i; j++){
- newdata[j] = data[i] ;
- }
- for( int j=i; j<n; j++){
- newdata[j] = StringUtf8(data[j]) ;
-
- }
- data = newdata ;
- }
- }
-
- inline CharacterVector::Proxy operator[](int i){ return data[i] ; }
- inline CharacterVector::const_Proxy operator[]( int i) const{ return data[i] ; }
-
- inline CharacterVector::iterator begin(){ return data.begin(); }
- inline CharacterVector::const_iterator begin() const{ return data.begin(); }
-
- inline CharacterVector::iterator end(){ return data.end(); }
- inline CharacterVector::const_iterator end() const{ return data.end(); }
-
- private:
- CharacterVector data ;
- } ;
-
-}
-
-#endif
diff --git a/inst/include/tools/SymbolMap.h b/inst/include/tools/SymbolMap.h
index fa14c93..b517e7a 100644
--- a/inst/include/tools/SymbolMap.h
+++ b/inst/include/tools/SymbolMap.h
@@ -1,125 +1,121 @@
#ifndef dplyr_tools_SymbolMap_h
#define dplyr_tools_SymbolMap_h
-namespace dplyr{
-
- enum Origin { HASH, RMATCH, NEW } ;
-
- struct SymbolMapIndex {
- int pos ;
- Origin origin ;
-
- SymbolMapIndex( int pos_, Origin origin_ ) :
- pos(pos_), origin(origin_)
- {}
- } ;
-
- class SymbolMap {
- public:
- SymbolMap(): lookup(), names(){}
-
- SymbolMapIndex insert( SEXP name ){
- if( TYPEOF(name) == SYMSXP ) {
- name = PRINTNAME(name) ;
- }
- SymbolMapIndex index = get_index(name) ;
- int idx = index.pos ;
- switch( index.origin ){
- case HASH:
- break;
- case RMATCH:
- lookup.insert( std::make_pair(name, idx ) ) ;
- break ;
- case NEW:
- names.push_back(name) ;
- lookup.insert( std::make_pair(name, idx ) ) ;
- break ;
- } ;
- return index ;
- }
-
- int size() const {
- return names.size() ;
- }
-
- bool has( SEXP name ) const {
- if( TYPEOF(name) == SYMSXP ) {
- name = PRINTNAME(name) ;
- }
- SymbolMapIndex index = get_index(name) ;
- return index.origin != NEW ;
- }
-
- SymbolMapIndex get_index(SEXP name) const {
- if( TYPEOF(name) == SYMSXP ) {
- name = PRINTNAME(name) ;
- }
-
- // first, lookup the map
- dplyr_hash_map<SEXP, int>::const_iterator it = lookup.find(name) ;
- if( it != lookup.end() ){
- return SymbolMapIndex( it->second, HASH ) ;
- }
-
- CharacterVector v = CharacterVector::create(name) ;
- int idx = as<int>( r_match( v, names ) );
- if( idx != NA_INTEGER ){
- // we have a match
- return SymbolMapIndex( idx-1, RMATCH ) ;
- }
-
- // no match
- return SymbolMapIndex( names.size(), NEW ) ;
- }
-
- int get( SEXP name ) const {
- if( TYPEOF(name) == SYMSXP ) {
- name = PRINTNAME(name) ;
- }
- SymbolMapIndex index = get_index(name) ;
- if( index.origin == NEW ){
- stop( "variable '%s' not found", CHAR(name) ) ;
- }
- return index.pos ;
- }
-
- SymbolMapIndex rm( SEXP name ){
- if( TYPEOF(name) == SYMSXP ) {
- name = PRINTNAME(name) ;
- }
- SymbolMapIndex index = get_index(name) ;
- if( index.origin != NEW ){
- int idx = index.pos ;
- names.erase( names.begin() + idx ) ;
-
- for( dplyr_hash_map<SEXP, int>::iterator it=lookup.begin(); it != lookup.end(); ){
- int k = it->second ;
-
- if( k < idx ) {
- // nothing to do in that case
- ++it ;
- continue ;
- } else if( k == idx ){
- // need to remove the data from the hash table
- it = lookup.erase(it) ;
- continue ;
- } else {
- // decrement the index
- it->second-- ;
- ++it ;
- }
- }
-
- }
-
- return index ;
- }
-
-
- dplyr_hash_map<SEXP, int> lookup ;
- CharacterVector names ;
-
- } ;
+#include <tools/hash.h>
+#include <tools/match.h>
+
+namespace dplyr {
+
+enum Origin { HASH, RMATCH, NEW };
+
+struct SymbolMapIndex {
+ int pos;
+ Origin origin;
+
+ SymbolMapIndex(int pos_, Origin origin_) :
+ pos(pos_), origin(origin_)
+ {}
+};
+
+class SymbolMap {
+private:
+ dplyr_hash_map<SEXP, int> lookup;
+ SymbolVector names;
+
+public:
+ SymbolMap(): lookup(), names() {}
+
+ SymbolMap(const SymbolVector& names_): lookup(), names(names_) {}
+
+ SymbolMapIndex insert(const SymbolString& name) {
+ SymbolMapIndex index = get_index(name);
+ int idx = index.pos;
+ switch (index.origin) {
+ case HASH:
+ break;
+ case RMATCH:
+ lookup.insert(std::make_pair(name.get_sexp(), idx));
+ break;
+ case NEW:
+ names.push_back(name.get_string());
+ lookup.insert(std::make_pair(name.get_sexp(), idx));
+ break;
+ };
+ return index;
+ }
+
+ SymbolVector get_names() const {
+ return names;
+ }
+
+ SymbolString get_name(const int i) const {
+ return names[i];
+ }
+
+ int size() const {
+ return names.size();
+ }
+
+ bool has(const SymbolString& name) const {
+ SymbolMapIndex index = get_index(name);
+ return index.origin != NEW;
+ }
+
+ SymbolMapIndex get_index(const SymbolString& name) const {
+ // first, lookup the map
+ dplyr_hash_map<SEXP, int>::const_iterator it = lookup.find(name.get_sexp());
+ if (it != lookup.end()) {
+ return SymbolMapIndex(it->second, HASH);
+ }
+
+ int idx = names.match(name);
+ if (idx != NA_INTEGER) {
+ // we have a match
+ return SymbolMapIndex(idx - 1, RMATCH);
+ }
+
+ // no match
+ return SymbolMapIndex(names.size(), NEW);
+ }
+
+ int get(const SymbolString& name) const {
+ SymbolMapIndex index = get_index(name);
+ if (index.origin == NEW) {
+ stop("variable '%s' not found", name.get_utf8_cstring());
+ }
+ return index.pos;
+ }
+
+ SymbolMapIndex rm(const SymbolString& name) {
+ SymbolMapIndex index = get_index(name);
+ if (index.origin != NEW) {
+ int idx = index.pos;
+ names.remove(idx);
+
+ for (dplyr_hash_map<SEXP, int>::iterator it = lookup.begin(); it != lookup.end();) {
+ int k = it->second;
+
+ if (k < idx) {
+ // nothing to do in that case
+ ++it;
+ continue;
+ } else if (k == idx) {
+ // need to remove the data from the hash table
+ it = lookup.erase(it);
+ continue;
+ } else {
+ // decrement the index
+ it->second--;
+ ++it;
+ }
+ }
+
+ }
+
+ return index;
+ }
+
+};
}
diff --git a/inst/include/tools/SymbolString.h b/inst/include/tools/SymbolString.h
new file mode 100644
index 0000000..0f02109
--- /dev/null
+++ b/inst/include/tools/SymbolString.h
@@ -0,0 +1,57 @@
+#ifndef dplyr_tools_SymbolString_h
+#define dplyr_tools_SymbolString_h
+
+#include <tools/encoding.h>
+
+namespace dplyr {
+
+class SymbolString {
+public:
+ SymbolString() {}
+
+ SymbolString(const char* str) : s(str) {}
+
+ SymbolString(const String& other) : s(other) {}
+
+ SymbolString(const String::StringProxy& other) : s(other) {}
+
+ SymbolString(const String::const_StringProxy& other) : s(other) {}
+
+ // Symbols are always encoded in the native encoding (#1950)
+ explicit SymbolString(const Symbol& symbol) : s(CHAR(PRINTNAME(symbol)), CE_NATIVE) {}
+
+public:
+ const String& get_string() const {
+ return s;
+ }
+
+ const Symbol get_symbol() const {
+ return Symbol(Rf_translateChar(s.get_sexp()));
+ }
+
+ const std::string get_utf8_cstring() const {
+ static Environment rlang = Environment::namespace_env("rlang");
+ static Function as_string = Function("as_string", rlang);
+ SEXP utf8_string = as_string(Rf_lang2(R_QuoteSymbol, get_symbol()));
+ return CHAR(STRING_ELT(utf8_string, 0));
+ }
+
+ bool is_empty() const {
+ return s == "";
+ }
+
+ SEXP get_sexp() const {
+ return s.get_sexp();
+ }
+
+ bool operator==(const SymbolString& other) const {
+ return Rf_NonNullStringMatch(get_sexp(), other.get_sexp());
+ }
+
+private:
+ String s;
+};
+
+}
+
+#endif
diff --git a/inst/include/tools/SymbolVector.h b/inst/include/tools/SymbolVector.h
new file mode 100644
index 0000000..1153cdb
--- /dev/null
+++ b/inst/include/tools/SymbolVector.h
@@ -0,0 +1,78 @@
+#ifndef dplyr_tools_SymbolVector_h
+#define dplyr_tools_SymbolVector_h
+
+#include <tools/SymbolString.h>
+#include <tools/match.h>
+
+namespace dplyr {
+
+class SymbolVector {
+public:
+ SymbolVector() {}
+
+ template <class T>
+ explicit SymbolVector(T v_) : v(v_) {}
+
+ explicit SymbolVector(SEXP x) : v(init(x)) {}
+ explicit SymbolVector(RObject x) : v(init(x)) {}
+
+public:
+ void push_back(const SymbolString& s) {
+ v.push_back(s.get_string());
+ }
+
+ void remove(const R_xlen_t idx) {
+ v.erase(v.begin() + idx);
+ }
+
+ const SymbolString operator[](const R_xlen_t i) const {
+ return SymbolString(v[i]);
+ }
+
+ void set(int i, const SymbolString& x) {
+ v[i] = x.get_string();
+ }
+
+ R_xlen_t size() const {
+ return v.size();
+ }
+
+ int match(const SymbolString& s) const {
+ CharacterVector vs = CharacterVector::create(s.get_string());
+ return as<int>(match(vs));
+ }
+
+ const IntegerVector match(const CharacterVector& m) const {
+ return r_match(m, v);
+ }
+
+ const IntegerVector match_in_table(const CharacterVector& t) const {
+ return r_match(v, t);
+ }
+
+ const CharacterVector get_vector() const {
+ return v;
+ }
+
+private:
+ CharacterVector v;
+ SEXP init(SEXP x) {
+ if (Rf_isNull(x))
+ return CharacterVector();
+ else
+ return x;
+ }
+};
+
+}
+
+namespace Rcpp {
+using namespace dplyr;
+
+template <> inline SEXP wrap(const SymbolVector& x) {
+ return x.get_vector();
+}
+
+}
+
+#endif
diff --git a/inst/include/tools/all_na.h b/inst/include/tools/all_na.h
index 956478e..3e59a4c 100644
--- a/inst/include/tools/all_na.h
+++ b/inst/include/tools/all_na.h
@@ -2,17 +2,17 @@
#define dplyr_tools_all_na_H
template <int RTYPE>
-inline bool all_na_impl( const Vector<RTYPE>& x ){
- return all( is_na(x) ).is_true() ;
+inline bool all_na_impl(const Vector<RTYPE>& x) {
+ return all(is_na(x)).is_true();
}
template <>
-inline bool all_na_impl<REALSXP>( const NumericVector& x ){
- return all( is_na(x) & !is_nan(x) ).is_true() ;
+inline bool all_na_impl<REALSXP>(const NumericVector& x) {
+ return all(is_na(x) & !is_nan(x)).is_true();
}
-inline bool all_na( SEXP x ){
- RCPP_RETURN_VECTOR( all_na_impl, x ) ;
+inline bool all_na(SEXP x) {
+ RCPP_RETURN_VECTOR(all_na_impl, x);
}
#endif
diff --git a/inst/include/tools/collapse.h b/inst/include/tools/collapse.h
index b0c2be5..aa02108 100644
--- a/inst/include/tools/collapse.h
+++ b/inst/include/tools/collapse.h
@@ -1,28 +1,28 @@
#ifndef dplyr_collapse_H
#define dplyr_collapse_H
-namespace Rcpp {
+namespace dplyr {
- template <int RTYPE>
- const char* toString( typename ::Rcpp::traits::storage_type<RTYPE>::type from){
- SEXP s = internal::r_coerce<RTYPE,STRSXP>( from ) ;
- return CHAR(s) ;
- }
-
- template <int RTYPE>
- std::string collapse( const Vector<RTYPE>& x, const char* sep = ", " ){
- std::stringstream ss;
- int n = x.size() ;
- if( n > 0){
- ss << toString<RTYPE>(x[0]) ;
- for( int i=1; i<n; i++) {
- const char* st = toString<RTYPE>(x[i]) ;
- ss << sep << st ;
- }
- }
+template <int RTYPE>
+const char* to_string_utf8(typename Rcpp::traits::storage_type<RTYPE>::type from) {
+ SEXP s = Rcpp::internal::r_coerce<RTYPE, STRSXP>(from);
+ return Rf_translateCharUTF8(s);
+}
- return ss.str();
+template <int RTYPE>
+std::string collapse_utf8(const Vector<RTYPE>& x, const char* sep = ", ", const char* quote = "") {
+ std::stringstream ss;
+ int n = x.size();
+ if (n > 0) {
+ ss << quote << to_string_utf8<RTYPE>(x[0]) << quote;
+ for (int i = 1; i < n; i++) {
+ const char* st = to_string_utf8<RTYPE>(x[i]);
+ ss << sep << quote << st << quote;
}
+ }
+
+ return ss.str();
+}
}
#endif
diff --git a/inst/include/tools/complex.h b/inst/include/tools/complex.h
deleted file mode 100644
index eb5eccc..0000000
--- a/inst/include/tools/complex.h
+++ /dev/null
@@ -1,10 +0,0 @@
-#ifndef dplyr_tools_complex_H
-#define dplyr_tools_complex_H
-
-#if RCPP_VERSION < Rcpp_Version(0,12,2)
-inline std::ostream & operator<<(std::ostream &os, const Rcomplex& cplx ){
- return os << cplx.r << "+" << cplx.i << "i" ;
-}
-#endif
-
-#endif
diff --git a/inst/include/tools/debug.h b/inst/include/tools/debug.h
new file mode 100644
index 0000000..2c2331a
--- /dev/null
+++ b/inst/include/tools/debug.h
@@ -0,0 +1,13 @@
+#ifndef dplyr_tools_debug_H
+#define dplyr_tools_debug_H
+
+// borrowed from Rcpp11
+#ifndef RCPP_DEBUG_OBJECT
+#define RCPP_DEBUG_OBJECT(OBJ) Rf_PrintValue( Rf_eval( Rf_lang2( Rf_install( "str"), OBJ ), R_GlobalEnv ) );
+#endif
+
+#ifndef RCPP_INSPECT_OBJECT
+#define RCPP_INSPECT_OBJECT(OBJ) Rf_PrintValue( Rf_eval( Rf_lang2( Rf_install( ".Internal"), Rf_lang2( Rf_install( "inspect" ), OBJ ) ), R_GlobalEnv ) );
+#endif
+
+#endif // #ifndef dplyr_tools_debug_H
diff --git a/inst/include/tools/delete_all.h b/inst/include/tools/delete_all.h
deleted file mode 100644
index 7a7013e..0000000
--- a/inst/include/tools/delete_all.h
+++ /dev/null
@@ -1,16 +0,0 @@
-#ifndef dplyr_tools_delete_all_H
-#define dplyr_tools_delete_all_H
-
-namespace dplyr {
-
- template <typename T>
- void delete_all_second( T& value ){
- for( typename T::iterator it=value.begin(); it!=value.end(); ++it) {
- delete it->second ;
- }
- value.clear() ;
- }
-
-}
-
-#endif
diff --git a/inst/include/tools/encoding.h b/inst/include/tools/encoding.h
new file mode 100644
index 0000000..7f9eb6c
--- /dev/null
+++ b/inst/include/tools/encoding.h
@@ -0,0 +1,65 @@
+#ifndef DPLYR_ENCODING_H
+#define DPLYR_ENCODING_H
+
+#define TYPE_BITS 5
+#define BYTES_MASK (1<<1)
+#define LATIN1_MASK (1<<2)
+#define UTF8_MASK (1<<3)
+#define ASCII_MASK (1<<6)
+
+struct sxpinfo_struct {
+ // *INDENT-OFF*
+ SEXPTYPE type : TYPE_BITS;/* ==> (FUNSXP == 99) %% 2^5 == 3 == CLOSXP
+ * -> warning: `type' is narrower than values
+ * of its type
+ * when SEXPTYPE was an enum */
+ // *INDENT-ON*
+ unsigned int obj : 1;
+ unsigned int named : 2;
+ unsigned int gp : 16;
+ unsigned int mark : 1;
+ unsigned int debug : 1;
+ unsigned int trace : 1; /* functions and memory tracing */
+ unsigned int spare : 1; /* currently unused */
+ unsigned int gcgen : 1; /* old generation number */
+ unsigned int gccls : 3; /* node class */
+}; /* Tot: 32 */
+
+#ifndef IS_BYTES
+#define IS_BYTES(x) (reinterpret_cast<sxpinfo_struct*>(x)->gp & BYTES_MASK)
+#endif
+
+#ifndef IS_LATIN1
+#define IS_LATIN1(x) (reinterpret_cast<sxpinfo_struct*>(x)->gp & LATIN1_MASK)
+#endif
+
+#ifndef IS_ASCII
+#define IS_ASCII(x) (reinterpret_cast<sxpinfo_struct*>(x)->gp & ASCII_MASK)
+#endif
+
+#ifndef IS_UTF8
+#define IS_UTF8(x) (reinterpret_cast<sxpinfo_struct*>(x)->gp & UTF8_MASK)
+#endif
+
+// that bit seems unused by R. Just using it to mark
+// objects as Shrinkable Vectors
+// that is useful for things like summarise(list(x)) where x is a
+// variable from the data, because the SEXP that goes into the list
+// is the shrinkable vector, we use this information to duplicate
+// it if needed. See the maybe_copy method in DelayedProcessor
+#define DPLYR_SHRINKABLE_MASK (1<<8)
+
+#define IS_DPLYR_SHRINKABLE_VECTOR(x) (reinterpret_cast<sxpinfo_struct*>(x)->gp & DPLYR_SHRINKABLE_MASK)
+#define SET_DPLYR_SHRINKABLE_VECTOR(x) (reinterpret_cast<sxpinfo_struct*>(x)->gp |= DPLYR_SHRINKABLE_MASK)
+#define UNSET_DPLYR_SHRINKABLE_VECTOR(x) (reinterpret_cast<sxpinfo_struct*>(x)->gp &= (~DPLYR_SHRINKABLE_MASK) )
+
+
+namespace dplyr {
+
+CharacterVector reencode_factor(IntegerVector x);
+CharacterVector reencode_char(SEXP x);
+
+}
+
+
+#endif
diff --git a/inst/include/tools/get_all_second.h b/inst/include/tools/get_all_second.h
deleted file mode 100644
index 393958b..0000000
--- a/inst/include/tools/get_all_second.h
+++ /dev/null
@@ -1,18 +0,0 @@
-#ifndef dplyr_get_all_second_H
-#define dplyr_get_all_second_H
-
-namespace dplyr {
-
- template <typename Map>
- List get_all_second( const Map& map){
- int ngroups = map.size() ;
- List res(ngroups);
- typename Map::const_iterator it=map.begin() ;
- for( int i=0; i<ngroups; i++, ++it)
- res[i] = it->second ;
- return res ;
- }
-
-}
-
-#endif
diff --git a/inst/include/tools/get_single_class.h b/inst/include/tools/get_single_class.h
deleted file mode 100644
index c2321ba..0000000
--- a/inst/include/tools/get_single_class.h
+++ /dev/null
@@ -1,35 +0,0 @@
-#ifndef dplyr_get_single_class_h
-#define dplyr_get_single_class_h
-
-namespace dplyr {
-
- inline std::string get_single_class(SEXP x){
- SEXP klass = Rf_getAttrib(x, R_ClassSymbol) ;
- if( !Rf_isNull(klass) ){
- CharacterVector classes(klass) ;
- return collapse<STRSXP>(classes) ;
- }
-
- if(Rf_isMatrix(x)){
- return "matrix" ;
- }
-
- switch( TYPEOF(x) ){
- case INTSXP: return "integer" ;
- case REALSXP : return "numeric" ;
- case LGLSXP: return "logical" ;
- case STRSXP: return "character" ;
-
- case VECSXP: return "list" ;
- default: break ;
- }
-
- // just call R to deal with other cases
- // we could call R_data_class directly but we might get a "this is not part of the api"
- klass = Rf_eval( Rf_lang2( Rf_install( "class" ), x), R_GlobalEnv ) ;
- return CHAR(STRING_ELT(klass,0)) ;
- }
-
-}
-
-#endif
diff --git a/inst/include/tools/hash.h b/inst/include/tools/hash.h
index a43e5a1..20a294b 100644
--- a/inst/include/tools/hash.h
+++ b/inst/include/tools/hash.h
@@ -1,11 +1,31 @@
#ifndef dplyr_HASH_H
#define dplyr_HASH_H
-inline std::size_t hash_value(const Rcomplex& cx){
- boost::hash<double> hasher;
- size_t seed = hasher(cx.r) ;
- boost::hash_combine( seed, hasher(cx.i) ) ;
- return seed ;
+#include <boost/functional/hash.hpp>
+
+#ifndef dplyr_hash_map
+#if defined(_WIN32)
+#define dplyr_hash_map RCPP_UNORDERED_MAP
+#else
+#include <boost/unordered_map.hpp>
+#define dplyr_hash_map boost::unordered_map
+#endif
+#endif // #ifndef dplyr_hash_map
+
+#ifndef dplyr_hash_set
+#if defined(_WIN32)
+#define dplyr_hash_set RCPP_UNORDERED_SET
+#else
+#include <boost/unordered_set.hpp>
+#define dplyr_hash_set boost::unordered_set
+#endif
+#endif // #ifndef dplyr_hash_set
+
+inline std::size_t hash_value(const Rcomplex& cx) {
+ boost::hash<double> hasher;
+ size_t seed = hasher(cx.r);
+ boost::hash_combine(seed, hasher(cx.i));
+ return seed;
}
#endif
diff --git a/inst/include/tools/match.h b/inst/include/tools/match.h
index e41fdcb..6b0f63a 100644
--- a/inst/include/tools/match.h
+++ b/inst/include/tools/match.h
@@ -4,20 +4,21 @@
namespace dplyr {
-class RMatch {
-public:
- RMatch() : match_fun("match", R_BaseEnv) {}
- IntegerVector operator()(SEXP x, SEXP y) {
- return match_fun(x, y, NA_INTEGER, CharacterVector());
+inline IntegerVector r_match(SEXP x, SEXP y, SEXP incomparables = R_NilValue) {
+ static Function match("match", R_BaseEnv);
+ if (R_VERSION == R_Version(3, 3, 0)) {
+ // Work around matching bug in R 3.3.0: #1806
+ // https://bugs.r-project.org/bugzilla3/show_bug.cgi?id=16885
+ if (Rf_isNull(incomparables)) {
+ return match(x, y, NA_INTEGER, LogicalVector());
+ }
+ else {
+ return match(x, y, NA_INTEGER, incomparables);
+ }
+ }
+ else {
+ return match(x, y, NA_INTEGER, incomparables);
}
-
-private:
- Function match_fun;
-};
-
-inline IntegerVector r_match( SEXP x, SEXP y ) {
- static RMatch m;
- return m(x, y);
}
}
diff --git a/inst/include/tools/pointer_vector.h b/inst/include/tools/pointer_vector.h
index e3925ee..03b1134 100644
--- a/inst/include/tools/pointer_vector.h
+++ b/inst/include/tools/pointer_vector.h
@@ -3,43 +3,44 @@
namespace dplyr {
- template <typename T>
- class pointer_vector {
- public:
+template <typename T>
+class pointer_vector {
+public:
- typedef typename std::vector<T*> Vector ;
- typedef typename Vector::reference reference ;
- typedef typename Vector::const_reference const_reference ;
- typedef typename Vector::size_type size_type ;
- typedef typename Vector::value_type value_type ;
- typedef typename Vector::iterator iterator ;
+ typedef typename std::vector<T*> Vector;
+ typedef typename Vector::reference reference;
+ typedef typename Vector::const_reference const_reference;
+ typedef typename Vector::size_type size_type;
+ typedef typename Vector::value_type value_type;
+ typedef typename Vector::iterator iterator;
- pointer_vector() : data(){}
- pointer_vector(size_type n) : data(n){}
- inline ~pointer_vector(){
- typedef typename Vector::size_type size_type ;
- size_type n = data.size() ;
- iterator it = data.end() ; --it ;
- for( size_type i=0 ; i<n; --it, i++) delete *it ;
- }
+ pointer_vector() : data() {}
+ pointer_vector(size_type n) : data(n) {}
+ inline ~pointer_vector() {
+ typedef typename Vector::size_type size_type;
+ size_type n = data.size();
+ iterator it = data.end();
+ --it;
+ for (size_type i = 0; i < n; --it, i++) delete *it;
+ }
- inline reference operator[](size_type i){
- return data[i] ;
- }
- inline const_reference operator[](size_type i) const {
- return data[i];
- }
- inline void push_back( const value_type& value ){
- data.push_back(value);
- }
- inline size_type size() const {
- return data.size() ;
- }
+ inline reference operator[](size_type i) {
+ return data[i];
+ }
+ inline const_reference operator[](size_type i) const {
+ return data[i];
+ }
+ inline void push_back(const value_type& value) {
+ data.push_back(value);
+ }
+ inline size_type size() const {
+ return data.size();
+ }
- private:
- Vector data ;
- pointer_vector( const pointer_vector& ) ;
- } ;
+private:
+ Vector data;
+ pointer_vector(const pointer_vector&);
+};
}
#endif
diff --git a/inst/include/tools/rlang-export.h b/inst/include/tools/rlang-export.h
new file mode 100644
index 0000000..62fa226
--- /dev/null
+++ b/inst/include/tools/rlang-export.h
@@ -0,0 +1,21 @@
+#ifndef RLANG_EXPORT_H
+#define RLANG_EXPORT_H
+
+#define R_NO_REMAP
+#include <Rinternals.h>
+#include <Rversion.h>
+#include <R_ext/Rdynload.h>
+
+
+#if (defined(R_VERSION) && R_VERSION < R_Version(3, 4, 0))
+typedef union {
+ void* p;
+ DL_FUNC fn;
+} fn_ptr;
+SEXP R_MakeExternalPtrFn(DL_FUNC p, SEXP tag, SEXP prot);
+DL_FUNC R_ExternalPtrAddrFn(SEXP s);
+#endif
+
+void rlang_register_pointer(const char* ns, const char* ptr_name, DL_FUNC fn);
+
+#endif
diff --git a/inst/include/tools/scalar_type.h b/inst/include/tools/scalar_type.h
new file mode 100644
index 0000000..2a1460c
--- /dev/null
+++ b/inst/include/tools/scalar_type.h
@@ -0,0 +1,22 @@
+#ifndef DPLYR_SCALAR_TYPE_H
+#define DPLYR_SCALAR_TYPE_H
+
+namespace dplyr {
+
+namespace traits {
+
+template <int RTYPE>
+struct scalar_type {
+ typedef typename Rcpp::traits::storage_type<RTYPE>::type type;
+};
+
+template <>
+struct scalar_type<STRSXP> {
+ typedef String type;
+};
+
+}
+
+}
+
+#endif //DPLYR_SCALAR_TYPE_H
diff --git a/inst/include/tools/tools.h b/inst/include/tools/tools.h
index ee6f81f..6b8981a 100644
--- a/inst/include/tools/tools.h
+++ b/inst/include/tools/tools.h
@@ -1,23 +1,12 @@
#ifndef dplyr_tools_tools_H
#define dplyr_tools_tools_H
-#include <tools/Encoding.h>
-#include <tools/match.h>
-#include <tools/SymbolMap.h>
-#include <tools/StringUTF8.h>
-#include <tools/complex.h>
-#include <tools/DotsOf.h>
-#include <tools/Call.h>
+#include <tools/debug.h>
#include <tools/hash.h>
-#include <tools/delete_all.h>
+#include <tools/match.h>
#include <tools/pointer_vector.h>
-#include <tools/FilteredListOf.h>
#include <tools/collapse.h>
-#include <tools/get_single_class.h>
-#include <tools/SlicingIndex.h>
-#include <tools/ShrinkableVector.h>
-#include <tools/wrap_subset.h>
-#include <tools/get_all_second.h>
-#include <tools/LazyDots.h>
+#include <tools/Quosure.h>
+#include <tools/utils.h>
#endif
diff --git a/inst/include/tools/utils.h b/inst/include/tools/utils.h
new file mode 100644
index 0000000..11a0f8f
--- /dev/null
+++ b/inst/include/tools/utils.h
@@ -0,0 +1,49 @@
+#ifndef dplyr_tools_utils_H
+#define dplyr_tools_utils_H
+
+#include <tools/SymbolVector.h>
+
+void assert_all_white_list(const DataFrame&);
+SEXP shared_SEXP(SEXP x);
+SEXP shallow_copy(const List& data);
+SEXP pairlist_shallow_copy(SEXP p);
+void copy_attributes(SEXP out, SEXP data);
+void strip_index(DataFrame x);
+SEXP null_if_empty(SEXP x);
+
+bool is_vector(SEXP x);
+bool is_atomic(SEXP x);
+
+SEXP vec_names(SEXP x);
+bool is_str_empty(SEXP str);
+bool has_name_at(SEXP x, R_len_t i);
+SEXP name_at(SEXP x, size_t i);
+
+SEXP f_env(SEXP x);
+bool is_quosure(SEXP x);
+SEXP maybe_rhs(SEXP x);
+
+
+namespace dplyr {
+
+std::string get_single_class(SEXP x);
+CharacterVector default_chars(SEXP x, R_xlen_t len);
+CharacterVector get_class(SEXP x);
+SEXP set_class(SEXP x, const CharacterVector& class_);
+CharacterVector get_levels(SEXP x);
+SEXP set_levels(SEXP x, const CharacterVector& levels);
+bool same_levels(SEXP left, SEXP right);
+bool character_vector_equal(const CharacterVector& x, const CharacterVector& y);
+
+SymbolVector get_vars(SEXP x, bool duplicate = false);
+void set_vars(SEXP x, const SymbolVector& vars);
+void copy_vars(SEXP target, SEXP source);
+
+// effectively the same as copy_attributes but without names and dims
+inline void copy_most_attributes(SEXP out, SEXP data) {
+ Rf_copyMostAttrib(data, out);
+}
+
+}
+
+#endif // #ifndef dplyr_tools_utils_H
diff --git a/inst/include/tools/wrap_subset.h b/inst/include/tools/wrap_subset.h
index ae3b8cb..983bcdb 100644
--- a/inst/include/tools/wrap_subset.h
+++ b/inst/include/tools/wrap_subset.h
@@ -3,27 +3,16 @@
namespace dplyr {
- template <int RTYPE, typename Container>
- SEXP wrap_subset( SEXP input, const Container& indices ){
- int n = indices.size() ;
- Rcpp::Vector<RTYPE> res = Rcpp::no_init(n) ;
- typedef typename Rcpp::traits::storage_type<RTYPE>::type STORAGE ;
- STORAGE* ptr = Rcpp::internal::r_vector_start<RTYPE>( input ) ;
- for( int i=0; i<n; i++)
- res[i] = ptr[ indices[i] ] ;
- return res ;
- }
-
- template <int RTYPE, typename Container>
- SEXP wrap_subset_1_based( SEXP input, const Container& indices ){
- int n = indices.size() ;
- Rcpp::Vector<RTYPE> res = Rcpp::no_init(n) ;
- typedef typename Rcpp::traits::storage_type<RTYPE>::type STORAGE ;
- STORAGE* ptr = Rcpp::internal::r_vector_start<RTYPE>( input ) ;
- for( int i=0; i<n; i++)
- res[i] = ptr[ indices[i]-1 ] ;
- return res ;
- }
+template <int RTYPE, typename Container>
+SEXP wrap_subset(SEXP input, const Container& indices) {
+ int n = indices.size();
+ Rcpp::Vector<RTYPE> res = Rcpp::no_init(n);
+ typedef typename Rcpp::traits::storage_type<RTYPE>::type STORAGE;
+ STORAGE* ptr = Rcpp::internal::r_vector_start<RTYPE>(input);
+ for (int i = 0; i < n; i++)
+ res[i] = ptr[ indices[i] ];
+ return res;
+}
}
diff --git a/man/add_rownames.Rd b/man/add_rownames.Rd
index 3c14bf4..b1c677d 100644
--- a/man/add_rownames.Rd
+++ b/man/add_rownames.Rd
@@ -12,11 +12,11 @@ add_rownames(df, var = "rowname")
\item{var}{Name of variable to use}
}
\description{
-Deprecated, use \code{\link[tibble]{rownames_to_column}} instead.
+Deprecated, use \code{\link[tibble:rownames_to_column]{tibble::rownames_to_column()}} instead.
}
\examples{
mtcars \%>\% tbl_df()
mtcars \%>\% add_rownames()
}
-
+\keyword{internal}
diff --git a/man/all_equal.Rd b/man/all_equal.Rd
index ed895e3..154ea3a 100644
--- a/man/all_equal.Rd
+++ b/man/all_equal.Rd
@@ -1,9 +1,9 @@
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/all-equal.r
\name{all_equal}
-\alias{all.equal.tbl_df}
\alias{all_equal}
-\title{Flexible equality comparison for data frames.}
+\alias{all.equal.tbl_df}
+\title{Flexible equality comparison for data frames}
\usage{
all_equal(target, current, ignore_col_order = TRUE, ignore_row_order = TRUE,
convert = FALSE, ...)
@@ -21,16 +21,16 @@ all_equal(target, current, ignore_col_order = TRUE, ignore_row_order = TRUE,
\item{convert}{Should similar classes be converted? Currently this will
convert factor to character and integer to double.}
-\item{...}{Ignored. Needed for compatibility with \code{all.equal}.}
+\item{...}{Ignored. Needed for compatibility with \code{all.equal()}.}
}
\value{
\code{TRUE} if equal, otherwise a character vector describing
- the reasons why they're not equal. Use \code{\link{isTRUE}} if using the
- result in an \code{if} expression.
+the reasons why they're not equal. Use \code{\link[=isTRUE]{isTRUE()}} if using the
+result in an \code{if} expression.
}
\description{
-You can use \code{all_equal} with any data frame, and dplyr also provides
-\code{tbl_df} methods for \code{\link{all.equal}}.
+You can use \code{all_equal()} with any data frame, and dplyr also provides
+\code{tbl_df} methods for \code{\link[=all.equal]{all.equal()}}.
}
\examples{
scramble <- function(x) x[sample(nrow(x)), sample(ncol(x))]
@@ -49,4 +49,3 @@ all_equal(df1, df2)
# But you can request dplyr convert similar types
all_equal(df1, df2, convert = TRUE)
}
-
diff --git a/man/all_vars.Rd b/man/all_vars.Rd
new file mode 100644
index 0000000..212809c
--- /dev/null
+++ b/man/all_vars.Rd
@@ -0,0 +1,32 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/colwise.R
+\name{all_vars}
+\alias{all_vars}
+\alias{any_vars}
+\title{Apply predicate to all variables}
+\usage{
+all_vars(expr)
+
+any_vars(expr)
+}
+\arguments{
+\item{expr}{A predicate expression. This variable supports
+\link[rlang:quasiquotation]{unquoting} and will be evaluated in the
+context of the data frame. It should return a logical vector.
+
+This argument is automatically \link[rlang:quo]{quoted} and later
+\link[rlang:eval_tidy]{evaluated} in the context of the data
+frame. It supports \link[rlang:quasiquotation]{unquoting}. See
+\code{vignette("programming")} for an introduction to these concepts.}
+}
+\description{
+These quoting functions signal to scoped filtering verbs
+(e.g. \code{\link[=filter_if]{filter_if()}} or \code{\link[=filter_all]{filter_all()}}) that a predicate expression
+should be applied to all relevant variables. The \code{all_vars()}
+variant takes the intersection of the predicate expressions with
+\code{&} while the \code{any_vars()} variant takes the union with \code{|}.
+}
+\seealso{
+\code{\link[=funs]{funs()}} and \code{\link[=vars]{vars()}} for other quoting functions that you
+can use with scoped verbs.
+}
diff --git a/man/arrange.Rd b/man/arrange.Rd
index 6610176..b45910f 100644
--- a/man/arrange.Rd
+++ b/man/arrange.Rd
@@ -2,47 +2,53 @@
% Please edit documentation in R/manip.r
\name{arrange}
\alias{arrange}
-\alias{arrange_}
-\title{Arrange rows by variables.}
+\alias{arrange.grouped_df}
+\title{Arrange rows by variables}
\usage{
arrange(.data, ...)
-arrange_(.data, ..., .dots)
+\method{arrange}{grouped_df}(.data, ..., .by_group = FALSE)
}
\arguments{
\item{.data}{A tbl. All main verbs are S3 generics and provide methods
-for \code{\link{tbl_df}}, \code{\link[dtplyr]{tbl_dt}} and \code{\link{tbl_sql}}.}
+for \code{\link[=tbl_df]{tbl_df()}}, \code{\link[dtplyr:tbl_dt]{dtplyr::tbl_dt()}} and \code{\link[dbplyr:tbl_dbi]{dbplyr::tbl_dbi()}}.}
\item{...}{Comma separated list of unquoted variable names. Use
-\code{\link{desc}} to sort a variable in descending order.}
+\code{\link[=desc]{desc()}} to sort a variable in descending order.}
-\item{.dots}{Used to work around non-standard evaluation. See
-\code{vignette("nse")} for details.}
+\item{.by_group}{If \code{TRUE}, will sort first by grouping variable. Applies to
+grouped data frames only.}
}
\value{
An object of the same class as \code{.data}.
-
- Data frame row names are silently dropped. To preserve, convert to an
- explicit variable.
}
\description{
-Use \code{\link{desc}} to sort a variable in descending order. Generally,
-this will not also automatically order by grouping variables.
+Use \code{\link[=desc]{desc()}} to sort a variable in descending order.
}
\section{Locales}{
+The sort order for character vectors will depend on the collating sequence
+of the locale in use: see \code{\link[=locales]{locales()}}.
+}
+
+\section{Tidy data}{
-Note that for local data frames, the ordering is done in C++ code which
-does not have access to the local specific ordering usually done in R.
-This means that strings are ordered as if in the C locale.
+When applied to a data frame, row names are silently dropped. To preserve,
+convert to an explicit variable with \code{\link[tibble:rownames_to_column]{tibble::rownames_to_column()}}.
}
+
\examples{
arrange(mtcars, cyl, disp)
arrange(mtcars, desc(disp))
+
+# grouped arrange ignores groups
+by_cyl <- mtcars \%>\% group_by(cyl)
+by_cyl \%>\% arrange(desc(wt))
+# Unless you specifically ask:
+by_cyl \%>\% arrange(desc(wt), .by_group = TRUE)
}
\seealso{
-Other single.table.verbs: \code{\link{filter}},
+Other single table verbs: \code{\link{filter}},
\code{\link{mutate}}, \code{\link{select}},
\code{\link{slice}}, \code{\link{summarise}}
}
-
diff --git a/man/arrange_all.Rd b/man/arrange_all.Rd
new file mode 100644
index 0000000..da57426
--- /dev/null
+++ b/man/arrange_all.Rd
@@ -0,0 +1,56 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/colwise-arrange.R
+\name{arrange_all}
+\alias{arrange_all}
+\alias{arrange_at}
+\alias{arrange_if}
+\title{Arrange rows by a selection of variables}
+\usage{
+arrange_all(.tbl, .funs = list(), ...)
+
+arrange_at(.tbl, .vars, .funs = list(), ...)
+
+arrange_if(.tbl, .predicate, .funs = list(), ...)
+}
+\arguments{
+\item{.tbl}{A \code{tbl} object.}
+
+\item{.funs}{List of function calls generated by \code{\link[=funs]{funs()}}, or a
+character vector of function names, or simply a function.
+
+Bare formulas are passed to \code{\link[rlang:as_function]{rlang::as_function()}} to create
+purrr-style lambda functions. Note that these lambda prevent
+hybrid evaluation from happening and it is thus more efficient to
+supply functions like \code{mean()} directly rather than in a
+lambda-formula.}
+
+\item{...}{Additional arguments for the function calls in
+\code{.funs}. These are evaluated only once, with \link[rlang:dots_list]{explicit
+splicing}.}
+
+\item{.vars}{A list of columns generated by \code{\link[=vars]{vars()}},
+or a character vector of column names, or a numeric vector of column
+positions.}
+
+\item{.predicate}{A predicate function to be applied to the columns
+or a logical vector. The variables for which \code{.predicate} is or
+returns \code{TRUE} are selected. This argument is passed to
+\code{\link[rlang:as_function]{rlang::as_function()}} and thus supports quosure-style lambda
+functions and strings representing function names.}
+}
+\description{
+These \link{scoped} variants of \code{\link[=arrange]{arrange()}} sort a data frame by a
+selection of variables. Like \code{\link[=arrange]{arrange()}}, you can modify the
+variables before ordering with \code{\link[=funs]{funs()}}.
+}
+\examples{
+df <- as_tibble(mtcars)
+df
+arrange_all(df)
+
+# You can supply a function that will be applied before taking the
+# ordering of the variables. The variables of the sorted tibble
+# keep their original values.
+arrange_all(df, desc)
+arrange_all(df, funs(desc(.)))
+}
diff --git a/man/as.table.tbl_cube.Rd b/man/as.table.tbl_cube.Rd
index fab6a2c..76bef0c 100644
--- a/man/as.table.tbl_cube.Rd
+++ b/man/as.table.tbl_cube.Rd
@@ -1,8 +1,8 @@
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/tbl-cube.r
\name{as.table.tbl_cube}
-\alias{as.data.frame.tbl_cube}
\alias{as.table.tbl_cube}
+\alias{as.data.frame.tbl_cube}
\alias{as_data_frame.tbl_cube}
\title{Coerce a \code{tbl_cube} to other data structures}
\usage{
@@ -23,7 +23,6 @@
Supports conversion to tables, data frames, tibbles.
For a cube, the data frame returned by
- \code{\link[tibble]{as_data_frame}} resulting data frame contains the
- dimensions as character values (and not as factors).
+\code{\link[tibble:as_data_frame]{tibble::as_data_frame()}} resulting data frame contains the
+dimensions as character values (and not as factors).
}
-
diff --git a/man/as.tbl_cube.Rd b/man/as.tbl_cube.Rd
index 73d8451..aa426af 100644
--- a/man/as.tbl_cube.Rd
+++ b/man/as.tbl_cube.Rd
@@ -3,9 +3,9 @@
\name{as.tbl_cube}
\alias{as.tbl_cube}
\alias{as.tbl_cube.array}
-\alias{as.tbl_cube.data.frame}
-\alias{as.tbl_cube.matrix}
\alias{as.tbl_cube.table}
+\alias{as.tbl_cube.matrix}
+\alias{as.tbl_cube.data.frame}
\title{Coerce an existing data structure into a \code{tbl_cube}}
\usage{
as.tbl_cube(x, ...)
@@ -28,12 +28,11 @@ tables and data frames.}
\item{...}{Passed on to individual methods; otherwise ignored.}
-\item{dim_names}{names of the dimesions. Defaults to the names of}
+\item{dim_names}{names of the dimensions. Defaults to the names of}
\item{met_name}{a string to use as the name for the measure
-the \code{\link{dimnames}}.}
+the \code{\link[=dimnames]{dimnames()}}.}
}
\description{
Coerce an existing data structure into a \code{tbl_cube}
}
-
diff --git a/man/auto_copy.Rd b/man/auto_copy.Rd
index 7a97190..76ff330 100644
--- a/man/auto_copy.Rd
+++ b/man/auto_copy.Rd
@@ -2,12 +2,12 @@
% Please edit documentation in R/copy-to.r
\name{auto_copy}
\alias{auto_copy}
-\title{Copy tables to same source, if necessary.}
+\title{Copy tables to same source, if necessary}
\usage{
auto_copy(x, y, copy = FALSE, ...)
}
\arguments{
-\item{x, y}{\code{y} will be copied to \code{x}, if neccessary.}
+\item{x, y}{\code{y} will be copied to \code{x}, if necessary.}
\item{copy}{If \code{x} and \code{y} are not from the same data source,
and \code{copy} is \code{TRUE}, then \code{y} will be copied into the
@@ -17,6 +17,5 @@ it is a potentially expensive operation so you must opt into it.}
\item{...}{Other arguments passed on to methods.}
}
\description{
-Copy tables to same source, if necessary.
+Copy tables to same source, if necessary
}
-
diff --git a/man/backend_db.Rd b/man/backend_db.Rd
deleted file mode 100644
index 0b5f7e2..0000000
--- a/man/backend_db.Rd
+++ /dev/null
@@ -1,106 +0,0 @@
-% Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/dbi-s3.r
-\name{backend_db}
-\alias{backend_db}
-\alias{db_analyze}
-\alias{db_begin}
-\alias{db_commit}
-\alias{db_create_index}
-\alias{db_create_indexes}
-\alias{db_create_table}
-\alias{db_data_type}
-\alias{db_drop_table}
-\alias{db_explain}
-\alias{db_has_table}
-\alias{db_insert_into}
-\alias{db_list_tables}
-\alias{db_query_fields}
-\alias{db_query_rows}
-\alias{db_rollback}
-\alias{db_save_query}
-\title{Database generics.}
-\usage{
-db_list_tables(con)
-
-db_has_table(con, table)
-
-db_data_type(con, fields)
-
-db_save_query(con, sql, name, temporary = TRUE, ...)
-
-db_begin(con, ...)
-
-db_commit(con, ...)
-
-db_rollback(con, ...)
-
-db_create_table(con, table, types, temporary = FALSE, ...)
-
-db_insert_into(con, table, values, ...)
-
-db_create_indexes(con, table, indexes = NULL, unique = FALSE, ...)
-
-db_create_index(con, table, columns, name = NULL, unique = FALSE, ...)
-
-db_drop_table(con, table, force = FALSE, ...)
-
-db_analyze(con, table, ...)
-
-db_explain(con, sql, ...)
-
-db_query_fields(con, sql, ...)
-
-db_query_rows(con, sql, ...)
-}
-\arguments{
-\item{con}{A database connection.}
-
-\item{table}{A string, the table name.}
-
-\item{fields}{A list of fields, as in a data frame.}
-}
-\value{
-Usually a logical value indicating success. Most failures should generate
- an error. However, \code{db_has_table()} should return \code{NA} if
- temporary tables cannot be listed with \code{dbListTables} (due to backend
- API limitations for example). As a result, you methods will rely on the
- backend to throw an error if a table exists when it shouldn't.
-}
-\description{
-These generics execute actions on the database. Most generics have a method
-for \code{DBIConnection} which typically just call the standard DBI S4
-method.
-}
-\details{
-Note, a few backend methods do not call the standard DBI S4 methods including
-\itemize{
-\item \code{db_data_type}: Calls DBI's \code{dbDataType} for every field
-(e.g. data frame column) and returns a vector of corresponding SQL data
-types
-\item \code{db_save_query}: Builds and executes \code{CREATE [TEMPORARY]
-TABLE <table> ...} SQL command.
-\item \code{db_create_table}: Builds and executes \code{CREATE [TEMPORARY]
-TABLE <table> ...} SQL command.
-\item \code{db_create_index}: Builds and executes \code{CREATE INDEX <name>
-ON <table>} SQL command.
-\item \code{db_drop_table}: Builds and executes \code{DROP TABLE [IF EXISTS]
- <table>} SQL command.
-\item \code{db_analyze}: Builds and executes \code{ANALYZE <table>} SQL
-command.
-\item \code{db_insert_into} and \code{db_explain}: do not have methods
-calling corresponding DBI methods. The latter because no underlying DBI S4
-method exists and the former because calls to the corresponding DBI S4
-method (\code{dbWriteTable}) need to be able to specify an appropriate
-combination of values for non-standard \code{append} and \code{overwrite}
-arguments.
-}
-
-Currently, \code{copy_to} is the only user of \code{db_begin()}, \code{db_commit()},
-\code{db_rollback()}, \code{db_create_table()}, \code{db_insert_into()},
-\code{db_create_indexes()}, \code{db_drop_table()} and
-\code{db_analyze()}. If you find yourself overriding many of these
-functions it may suggest that you should just override \code{\link{copy_to}}
-instead.
-}
-\keyword{internal}
-
diff --git a/man/backend_dbplyr.Rd b/man/backend_dbplyr.Rd
new file mode 100644
index 0000000..b4b41e5
--- /dev/null
+++ b/man/backend_dbplyr.Rd
@@ -0,0 +1,148 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/dbplyr.R
+\name{backend_dbplyr}
+\alias{backend_dbplyr}
+\alias{backend_dbplyr}
+\alias{db_desc}
+\alias{backend_dbplyr}
+\alias{sql_translate_env}
+\alias{backend_dbplyr}
+\alias{db_list_tables}
+\alias{backend_dbplyr}
+\alias{db_has_table}
+\alias{backend_dbplyr}
+\alias{db_data_type}
+\alias{backend_dbplyr}
+\alias{db_save_query}
+\alias{backend_dbplyr}
+\alias{db_begin}
+\alias{backend_dbplyr}
+\alias{db_commit}
+\alias{backend_dbplyr}
+\alias{db_rollback}
+\alias{backend_dbplyr}
+\alias{db_write_table}
+\alias{backend_dbplyr}
+\alias{db_create_table}
+\alias{backend_dbplyr}
+\alias{db_insert_into}
+\alias{backend_dbplyr}
+\alias{db_create_indexes}
+\alias{backend_dbplyr}
+\alias{db_create_index}
+\alias{backend_dbplyr}
+\alias{db_drop_table}
+\alias{backend_dbplyr}
+\alias{db_analyze}
+\alias{db_explain}
+\alias{db_query_fields}
+\alias{db_query_rows}
+\alias{sql_select}
+\alias{sql_subquery}
+\alias{sql_join}
+\alias{sql_semi_join}
+\alias{sql_set_op}
+\alias{sql_escape_string}
+\alias{sql_escape_ident}
+\title{Database and SQL generics.}
+\usage{
+db_desc(x)
+
+sql_translate_env(con)
+
+db_list_tables(con)
+
+db_has_table(con, table)
+
+db_data_type(con, fields)
+
+db_save_query(con, sql, name, temporary = TRUE, ...)
+
+db_begin(con, ...)
+
+db_commit(con, ...)
+
+db_rollback(con, ...)
+
+db_write_table(con, table, types, values, temporary = FALSE, ...)
+
+db_create_table(con, table, types, temporary = FALSE, ...)
+
+db_insert_into(con, table, values, ...)
+
+db_create_indexes(con, table, indexes = NULL, unique = FALSE, ...)
+
+db_create_index(con, table, columns, name = NULL, unique = FALSE, ...)
+
+db_drop_table(con, table, force = FALSE, ...)
+
+db_analyze(con, table, ...)
+
+db_explain(con, sql, ...)
+
+db_query_fields(con, sql, ...)
+
+db_query_rows(con, sql, ...)
+
+sql_select(con, select, from, where = NULL, group_by = NULL,
+ having = NULL, order_by = NULL, limit = NULL, distinct = FALSE, ...)
+
+sql_subquery(con, from, name = random_table_name(), ...)
+
+sql_join(con, x, y, vars, type = "inner", by = NULL, ...)
+
+sql_semi_join(con, x, y, anti = FALSE, by = NULL, ...)
+
+sql_set_op(con, x, y, method)
+
+sql_escape_string(con, x)
+
+sql_escape_ident(con, x)
+}
+\arguments{
+\item{con}{A database connection.}
+
+\item{table}{A string, the table name.}
+
+\item{fields}{A list of fields, as in a data frame.}
+}
+\value{
+Usually a logical value indicating success. Most failures should generate
+an error. However, \code{db_has_table()} should return \code{NA} if
+temporary tables cannot be listed with \code{\link[DBI:dbListTables]{DBI::dbListTables()}} (due to backend
+API limitations for example). As a result, you methods will rely on the
+backend to throw an error if a table exists when it shouldn't.
+}
+\description{
+The \code{sql_} generics are used to build the different types of SQL queries.
+The default implementations in dbplyr generates ANSI 92 compliant SQL.
+The \code{db_} generics execute actions on the database. The default
+implementations in dbplyr typically just call the standard DBI S4
+method.
+}
+\details{
+A few backend methods do not call the standard DBI S4 methods including
+\itemize{
+\item \code{db_data_type()}: Calls \code{\link[DBI:dbDataType]{DBI::dbDataType()}} for every field
+(e.g. data frame column) and returns a vector of corresponding SQL data
+types
+\item \code{db_save_query()}: Builds and executes a
+\code{CREATE [TEMPORARY] TABLE <table> ...} SQL command.
+\item \code{db_create_index()}: Builds and executes a
+\code{CREATE INDEX <name> ON <table>} SQL command.
+\item \code{db_drop_table()}: Builds and executes a
+\code{DROP TABLE [IF EXISTS] <table>} SQL command.
+\item \code{db_analyze()}: Builds and executes an
+\code{ANALYZE <table>} SQL command.
+}
+
+Currently, \code{\link[=copy_to]{copy_to()}} is the only user of \code{db_begin()}, \code{db_commit()},
+\code{db_rollback()}, \code{db_write_table()}, \code{db_create_indexes()}, \code{db_drop_table()} and
+\code{db_analyze()}. If you find yourself overriding many of these
+functions it may suggest that you should just override \code{copy_to()}
+instead.
+
+\code{db_create_table()} and \code{db_insert_into()} have been deprecated
+in favour of \code{db_write_table()}.
+}
+\keyword{internal}
diff --git a/man/backend_sql.Rd b/man/backend_sql.Rd
deleted file mode 100644
index e7153b1..0000000
--- a/man/backend_sql.Rd
+++ /dev/null
@@ -1,41 +0,0 @@
-% Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/sql-generic.R
-\name{backend_sql}
-\alias{backend_sql}
-\alias{sql_escape_ident}
-\alias{sql_escape_string}
-\alias{sql_join}
-\alias{sql_select}
-\alias{sql_semi_join}
-\alias{sql_set_op}
-\alias{sql_subquery}
-\title{SQL generation.}
-\usage{
-sql_select(con, select, from, where = NULL, group_by = NULL,
- having = NULL, order_by = NULL, limit = NULL, distinct = FALSE, ...)
-
-sql_subquery(con, from, name = random_table_name(), ...)
-
-sql_join(con, x, y, type = "inner", by = NULL, ...)
-
-sql_semi_join(con, x, y, anti = FALSE, by = NULL, ...)
-
-sql_set_op(con, x, y, method)
-
-sql_escape_string(con, x)
-
-sql_escape_ident(con, x)
-}
-\arguments{
-\item{con}{A database connection.}
-}
-\value{
-An SQL string.
-}
-\description{
-These generics are used to run build various SQL queries. A default method
-generates ANSI 92 compliant SQL, but variations in SQL across databases means
-that it's likely that a backend will require at least a few methods.
-}
-\keyword{internal}
-
diff --git a/man/backend_src.Rd b/man/backend_src.Rd
deleted file mode 100644
index 451085f..0000000
--- a/man/backend_src.Rd
+++ /dev/null
@@ -1,20 +0,0 @@
-% Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/dbi-s3.r
-\name{backend_src}
-\alias{backend_src}
-\alias{sql_translate_env}
-\alias{sql_translate_env.NULL}
-\alias{src_desc}
-\title{Source generics.}
-\usage{
-src_desc(x)
-
-sql_translate_env(con)
-
-\method{sql_translate_env}{NULL}(con)
-}
-\description{
-These generics retrieve metadata for a given src.
-}
-\keyword{internal}
-
diff --git a/man/band_members.Rd b/man/band_members.Rd
new file mode 100644
index 0000000..6f08301
--- /dev/null
+++ b/man/band_members.Rd
@@ -0,0 +1,33 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/data-bands.R
+\docType{data}
+\name{band_members}
+\alias{band_members}
+\alias{band_instruments}
+\alias{band_instruments2}
+\title{Band membership}
+\format{Each is a tibble with two variables and three observations}
+\usage{
+band_members
+
+band_instruments
+
+band_instruments2
+}
+\description{
+These data sets describe band members of the Beatles and Rolling Stones. They
+are toy data sets that can be displayed in their entirety on a slide (e.g. to
+demonstrate a join).
+}
+\details{
+\code{band_instruments} and \code{band_instruments2} contain the same data but use
+different column names for the first column of the data set.
+\code{band_instruments} uses \code{name}, which matches the name of the key column of
+\code{band_members}; \code{band_instruments2} uses \code{artist}, which does not.
+}
+\examples{
+band_members
+band_instruments
+band_instruments2
+}
+\keyword{datasets}
diff --git a/man/bench_compare.Rd b/man/bench_compare.Rd
index c0492e1..6c0dd3d 100644
--- a/man/bench_compare.Rd
+++ b/man/bench_compare.Rd
@@ -4,21 +4,33 @@
\alias{bench_compare}
\alias{bench_tbls}
\alias{compare_tbls}
+\alias{compare_tbls2}
\alias{eval_tbls}
+\alias{eval_tbls2}
\title{Evaluate, compare, benchmark operations of a set of srcs.}
\usage{
bench_tbls(tbls, op, ..., times = 10)
compare_tbls(tbls, op, ref = NULL, compare = equal_data_frame, ...)
+compare_tbls2(tbls_x, tbls_y, op, ref = NULL, compare = equal_data_frame,
+ ...)
+
eval_tbls(tbls, op)
+
+eval_tbls2(tbls_x, tbls_y, op)
}
\arguments{
-\item{tbls}{A list of \code{\link{tbl}}s.}
+\item{tbls, tbls_x, tbls_y}{A list of \code{\link[=tbl]{tbl()}}s.}
\item{op}{A function with a single argument, called often with each
element of \code{tbls}.}
+\item{\dots}{For \code{compare_tbls()}: additional parameters passed on the
+\code{compare()} function
+
+For \code{bench_tbls()}: additional benchmarks to run.}
+
\item{times}{For benchmarking, the number of times each operation is
repeated.}
@@ -27,20 +39,15 @@ supplied, defaults to the results from the first \code{src}.}
\item{compare}{A function used to compare the results. Defaults to
\code{equal_data_frame} which ignores the order of rows and columns.}
-
-\item{\dots}{For \code{compare_tbls}: additional parameters passed on the
- \code{compare} function
-
- For \code{bench_tbls}: additional benchmarks to run.}
}
\value{
-\code{eval_tbls}: a list of data frames.
+\code{eval_tbls()}: a list of data frames.
- \code{compare_tbls}: an invisible \code{TRUE} on success, otherwise
- an error is thrown.
+\code{compare_tbls()}: an invisible \code{TRUE} on success, otherwise
+an error is thrown.
- \code{bench_tbls}: an object of class
- \code{\link[microbenchmark]{microbenchmark}}
+\code{bench_tbls()}: an object of class
+\code{\link[microbenchmark:microbenchmark]{microbenchmark::microbenchmark()}}
}
\description{
These functions support the comparison of results and timings across
@@ -79,6 +86,6 @@ bench_tbls(two_tables, op, times = 2)
}
}
\seealso{
-\code{\link{src_local}} for working with local data
+\code{\link[=src_local]{src_local()}} for working with local data
}
-
+\keyword{internal}
diff --git a/man/between.Rd b/man/between.Rd
index c582948..6659b4f 100644
--- a/man/between.Rd
+++ b/man/between.Rd
@@ -20,4 +20,3 @@ appropriate SQL for remote tables.
x <- rnorm(1e2)
x[between(x, -1, 1)]
}
-
diff --git a/man/bind.Rd b/man/bind.Rd
index 04d651c..cd50557 100644
--- a/man/bind.Rd
+++ b/man/bind.Rd
@@ -1,13 +1,15 @@
% Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/bind.r
+% Please edit documentation in R/bind.r, R/rbind.R
\name{bind}
\alias{bind}
-\alias{bind_cols}
+\alias{rbind_all}
+\alias{rbind_list}
\alias{bind_rows}
+\alias{bind_cols}
\alias{combine}
-\alias{rbind_all}
\alias{rbind_list}
-\title{Efficiently bind multiple data frames by row and column.}
+\alias{rbind_all}
+\title{Efficiently bind multiple data frames by row and column}
\usage{
bind_rows(..., .id = NULL)
@@ -18,47 +20,86 @@ combine(...)
\arguments{
\item{...}{Data frames to combine.
- Each argument can either be a data frame, a list that could be a data
- frame, or a list of data frames.
+Each argument can either be a data frame, a list that could be a data
+frame, or a list of data frames.
+
+When row-binding, columns are matched by name, and any missing
+columns with be filled with NA.
- When column-binding, rows are matched by position, not value so all data
- frames must have the same number of rows. To match by value, not
- position, see \code{left_join} etc. When row-binding, columns are
- matched by name, and any values that don't match will be filled with NA.}
+When column-binding, rows are matched by position, so all data
+frames must have the same number of rows. To match by value, not
+position, see \link{join}.}
-\item{.id}{Data frames identifier.
+\item{.id}{Data frame identifier.
- When \code{.id} is supplied, a new column of identifiers is
- created to link each row to its original data frame. The labels
- are taken from the named arguments to \code{bind_rows()}. When a
- list of data frames is supplied, the labels are taken from the
- names of the list. If no names are found a numeric sequence is
- used instead.}
+When \code{.id} is supplied, a new column of identifiers is
+created to link each row to its original data frame. The labels
+are taken from the named arguments to \code{bind_rows()}. When a
+list of data frames is supplied, the labels are taken from the
+names of the list. If no names are found a numeric sequence is
+used instead.}
}
\value{
-\code{bind_rows} and \code{bind_cols} return the same type as
- the first input, either a data frame, \code{tbl_df}, or \code{grouped_df}.
+\code{bind_rows()} and \code{bind_cols()} return the same type as
+the first input, either a data frame, \code{tbl_df}, or \code{grouped_df}.
}
\description{
This is an efficient implementation of the common pattern of
\code{do.call(rbind, dfs)} or \code{do.call(cbind, dfs)} for binding many
-data frames into one. \code{combine()} acts like \code{\link{c}()} or
-\code{\link{unlist}()} but uses consistent dplyr coercion rules.
+data frames into one. \code{combine()} acts like \code{\link[=c]{c()}} or
+\code{\link[=unlist]{unlist()}} but uses consistent dplyr coercion rules.
+}
+\details{
+The output of \code{bind_rows()} will contain a column if that column
+appears in any of the inputs.
}
\section{Deprecated functions}{
\code{rbind_list()} and \code{rbind_all()} have been deprecated. Instead use
\code{bind_rows()}.
}
+
\examples{
one <- mtcars[1:4, ]
two <- mtcars[11:14, ]
-# You can either supply data frames as arguments
+# You can supply data frames as arguments:
bind_rows(one, two)
-# Or a single argument containing a list of data frames
+
+# The contents of lists is automatically spliced:
bind_rows(list(one, two))
bind_rows(split(mtcars, mtcars$cyl))
+bind_rows(list(one, two), list(two, one))
+
+
+# In addition to data frames, you can supply vectors. In the rows
+# direction, the vectors represent rows and should have inner
+# names:
+bind_rows(
+ c(a = 1, b = 2),
+ c(a = 3, b = 4)
+)
+
+# You can mix vectors and data frames:
+bind_rows(
+ c(a = 1, b = 2),
+ data_frame(a = 3:4, b = 5:6),
+ c(a = 7, b = 8)
+)
+
+
+# Note that for historical reasons, lists containg vectors are
+# always treated as data frames. Thus their vectors are treated as
+# columns rather than rows, and their inner names are ignored:
+ll <- list(
+ a = c(A = 1, B = 2),
+ b = c(A = 3, B = 4)
+)
+bind_rows(ll)
+
+# You can circumvent that behaviour with explicit splicing:
+bind_rows(!!! ll)
+
# When you supply a column name with the `.id` argument, a new
# column is created to link each row to its original data frame
@@ -85,4 +126,3 @@ unlist(list(f1, f2))
combine(f1, f2)
combine(list(f1, f2))
}
-
diff --git a/man/build_sql.Rd b/man/build_sql.Rd
deleted file mode 100644
index 0c86755..0000000
--- a/man/build_sql.Rd
+++ /dev/null
@@ -1,37 +0,0 @@
-% Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/sql-escape.r
-\name{build_sql}
-\alias{build_sql}
-\title{Build a SQL string.}
-\usage{
-build_sql(..., .env = parent.frame(), con = NULL)
-}
-\arguments{
-\item{...}{input to convert to SQL. Use \code{\link{sql}} to preserve
-user input as is (dangerous), and \code{\link{ident}} to label user
-input as sql identifiers (safe)}
-
-\item{.env}{the environment in which to evalute the arguments. Should not
-be needed in typical use.}
-
-\item{con}{database connection; used to select correct quoting characters.}
-}
-\description{
-This is a convenience function that should prevent sql injection attacks
-(which in the context of dplyr are most likely to be accidental not
-deliberate) by automatically escaping all expressions in the input, while
-treating bare strings as sql. This is unlikely to prevent any serious
-attack, but should make it unlikely that you produce invalid sql.
-}
-\examples{
-build_sql("SELECT * FROM TABLE")
-x <- "TABLE"
-build_sql("SELECT * FROM ", x)
-build_sql("SELECT * FROM ", ident(x))
-build_sql("SELECT * FROM ", sql(x))
-
-# http://xkcd.com/327/
-name <- "Robert'); DROP TABLE Students;--"
-build_sql("INSERT INTO Students (Name) VALUES (", name, ")")
-}
-
diff --git a/man/case_when.Rd b/man/case_when.Rd
index 0107526..9f4aa50 100644
--- a/man/case_when.Rd
+++ b/man/case_when.Rd
@@ -2,26 +2,28 @@
% Please edit documentation in R/case_when.R
\name{case_when}
\alias{case_when}
-\title{A general vectorised if.}
+\title{A general vectorised if}
\usage{
case_when(...)
}
\arguments{
\item{...}{A sequence of two-sided formulas. The left hand side (LHS)
- determines which values match this case. The right hand side (RHS)
- provides the replacement value.
+determines which values match this case. The right hand side (RHS)
+provides the replacement value.
- The LHS must evaluate to a logical vector. Each logical vector can
- either have length 1 or a common length. All RHSs must evaluate to
- the same type of vector.}
+The LHS must evaluate to a logical vector. Each logical vector can
+either have length 1 or a common length. All RHSs must evaluate to
+the same type of vector.
+
+These dots are evaluated with \link[rlang:dots_list]{explicit splicing}.}
}
\value{
-A vector as long as the longest LHS, with the type (and
- attributes) of the first RHS. Inconsistent lengths of types will
- generate an error.
+A vector as long as the longest LHS or RHS, with the type (and
+attributes) of the first RHS. Inconsistent lengths or types will
+generate an error.
}
\description{
-This function allows you to vectorise mutiple \code{if} and \code{else if}
+This function allows you to vectorise multiple \code{if} and \code{else if}
statements. It is an R equivalent of the SQL \code{CASE WHEN} statement.
}
\examples{
@@ -41,5 +43,26 @@ case_when(
x \%\% 7 == 0 ~ "buzz",
x \%\% 35 == 0 ~ "fizz buzz"
)
-}
+# case_when is particularly useful inside mutate when you want to
+# create a new variable that relies on a complex combination of existing
+# variables
+starwars \%>\%
+ select(name:mass, gender, species) \%>\%
+ mutate(
+ type = case_when(
+ height > 200 | mass > 200 ~ "large",
+ species == "Droid" ~ "robot",
+ TRUE ~ "other"
+ )
+ )
+
+# Dots support splicing:
+patterns <- list(
+ x \%\% 35 == 0 ~ "fizz buzz",
+ x \%\% 5 == 0 ~ "fizz",
+ x \%\% 7 == 0 ~ "buzz",
+ TRUE ~ as.character(x)
+)
+case_when(!!! patterns)
+}
diff --git a/man/check_dbplyr.Rd b/man/check_dbplyr.Rd
new file mode 100644
index 0000000..d7b60b8
--- /dev/null
+++ b/man/check_dbplyr.Rd
@@ -0,0 +1,26 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/compat-dbplyr.R
+\name{check_dbplyr}
+\alias{check_dbplyr}
+\alias{wrap_dbplyr_obj}
+\title{dbplyr compatibility functions}
+\usage{
+check_dbplyr()
+
+wrap_dbplyr_obj(obj_name)
+}
+\description{
+In dplyr 0.6.0, a number of database and SQL functions moved from dplyr to
+dbplyr. The generic functions stayed in dplyr (since there is no easy way
+to conditionally import a generic from different packages), but many other
+SQL and database helper functions moved. If you have written a backend,
+these functions generate the code you need to work with both dplyr 0.5.0
+dplyr 0.6.0.
+}
+\examples{
+if (requireNamespace("dbplyr", quietly = TRUE)) {
+wrap_dbplyr_obj("build_sql")
+wrap_dbplyr_obj("base_agg")
+}
+}
+\keyword{internal}
diff --git a/man/coalesce.Rd b/man/coalesce.Rd
index 711c64c..addc0f3 100644
--- a/man/coalesce.Rd
+++ b/man/coalesce.Rd
@@ -4,18 +4,20 @@
\alias{coalesce}
\title{Find first non-missing element}
\usage{
-coalesce(x, ...)
+coalesce(...)
}
\arguments{
-\item{x, ...}{Vectors. All inputs should either be length 1, or the
-same length as \code{x}}
+\item{...}{Vectors. All inputs should either be length 1, or the
+same length as the first argument.
+
+These dots are evaluated with \link[rlang:dots_list]{explicit splicing}.}
}
\value{
-A vector the same length as \code{x} with missing values replaced
- by the first non-missing value.
+A vector the same length as the first \code{...} argument with
+missing values replaced by the first non-missing value.
}
\description{
-Given a set of vectors, \code{coelesce} finds the first non-missing value
+Given a set of vectors, \code{coalesce()} finds the first non-missing value
at each position. This is inspired by the SQL \code{COALESCE} function
which does the same thing for \code{NULL}s.
}
@@ -28,8 +30,14 @@ coalesce(x, 0L)
y <- c(1, 2, NA, NA, 5)
z <- c(NA, NA, 3, 4, 5)
coalesce(y, z)
+
+# Supply lists by splicing them into dots:
+vecs <- list(
+ c(1, 2, NA, NA, 5),
+ c(NA, NA, 3, 4, 5)
+)
+coalesce(!!! vecs)
}
\seealso{
-\code{\link{na_if}()} to replace specified values with a \code{NA}.
+\code{\link[=na_if]{na_if()}} to replace specified values with a \code{NA}.
}
-
diff --git a/man/common_by.Rd b/man/common_by.Rd
index 210c533..8e6326c 100644
--- a/man/common_by.Rd
+++ b/man/common_by.Rd
@@ -10,4 +10,3 @@ common_by(by = NULL, x, y)
Extract out common by variables
}
\keyword{internal}
-
diff --git a/man/compute.Rd b/man/compute.Rd
index b481592..cb1eab6 100644
--- a/man/compute.Rd
+++ b/man/compute.Rd
@@ -1,65 +1,54 @@
% Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/compute-collect.r, R/tbl-sql.r
+% Please edit documentation in R/compute-collect.r
\name{compute}
-\alias{collapse}
-\alias{collect}
\alias{compute}
-\alias{compute.tbl_sql}
-\title{Compute a lazy tbl.}
+\alias{collect}
+\alias{collapse}
+\title{Force computation of a database query}
\usage{
compute(x, name = random_table_name(), ...)
collect(x, ...)
collapse(x, ...)
-
-\method{compute}{tbl_sql}(x, name = random_table_name(), temporary = TRUE,
- unique_indexes = list(), indexes = list(), ...)
}
\arguments{
-\item{x}{a data tbl}
-
-\item{name}{name of temporary table on database.}
-
-\item{...}{other arguments passed on to methods}
-
-\item{temporary}{if \code{TRUE}, will create a temporary table that is
-local to this connection and will be automatically deleted when the
-connection expires}
+\item{x}{A tbl}
-\item{unique_indexes}{a list of character vectors. Each element of the list
-will create a new unique index over the specified column(s). Duplicate rows
-will result in failure.}
+\item{name}{Name of temporary table on database.}
-\item{indexes}{a list of character vectors. Each element of the list
-will create a new index.}
+\item{...}{Other arguments passed on to methods}
}
\description{
-\code{compute} forces computation of lazy tbls, leaving data in the remote
-source. \code{collect} also forces computation, but will bring data back into
-an R data.frame (stored in a \code{\link{tbl_df}}). \code{collapse} doesn't
-force computation, but collapses a complex tbl into a form that additional
-restrictions can be placed on.
+\code{compute()} stores results in a remote temporary table.
+\code{collect()} retrieves data into a local tibble.
+\code{collapse()} is slightly different: it doesn't force computation, but
+instead forces generation of the SQL query. This is sometimes needed to work
+around bugs in dplyr's SQL generation.
}
-\section{Grouping}{
-
-
-\code{compute} and \code{collect} preserve grouping, \code{collapse} drops
-it.
+\details{
+All functions preserve grouping and ordering.
}
\examples{
-\donttest{
-if (require("RSQLite") && has_lahman("sqlite")) {
- batting <- tbl(lahman_sqlite(), "Batting")
- remote <- select(filter(batting, yearID > 2010 && stint == 1), playerID:H)
- remote2 <- collapse(remote)
- cached <- compute(remote)
- local <- collect(remote)
-}
+if (require(dbplyr)) {
+ mtcars2 <- src_memdb() \%>\%
+ copy_to(mtcars, name = "mtcars2-cc", overwrite = TRUE)
+
+ remote <- mtcars2 \%>\%
+ filter(cyl == 8) \%>\%
+ select(mpg:drat)
+
+ # Compute query and save in remote table
+ compute(remote)
+
+ # Compute query bring back to this session
+ collect(remote)
+
+ # Creates a fresh query based on the generated SQL
+ collapse(remote)
}
}
\seealso{
-\code{\link{copy_to}} which is the conceptual opposite: it
- takes a local data frame and makes it available to the remote source.
+\code{\link[=copy_to]{copy_to()}}, the opposite of \code{collect()}: it takes a local data
+frame and uploads it to the remote source.
}
-
diff --git a/man/copy_to.Rd b/man/copy_to.Rd
index e95c292..fa671a8 100644
--- a/man/copy_to.Rd
+++ b/man/copy_to.Rd
@@ -2,9 +2,9 @@
% Please edit documentation in R/copy-to.r
\name{copy_to}
\alias{copy_to}
-\title{Copy a local data frame to a remote src.}
+\title{Copy a local data frame to a remote src}
\usage{
-copy_to(dest, df, name = deparse(substitute(df)), ...)
+copy_to(dest, df, name = deparse(substitute(df)), overwrite = FALSE, ...)
}
\arguments{
\item{dest}{remote data source}
@@ -13,6 +13,10 @@ copy_to(dest, df, name = deparse(substitute(df)), ...)
\item{name}{name for new remote table.}
+\item{overwrite}{If \code{TRUE}, will overwrite an existing table with
+name \code{name}. If \code{FALSE}, will throw an error if \code{name} already
+exists.}
+
\item{...}{other parameters passed to methods.}
}
\value{
@@ -23,4 +27,13 @@ This function uploads a local data frame into a remote data source, creating
the table definition as needed. Wherever possible, the new object will be
temporary, limited to the current connection to the source.
}
-
+\examples{
+\dontrun{
+iris2 <- dbplyr::src_memdb() \%>\% copy_to(iris, overwrite = TRUE)
+iris2
+}
+}
+\seealso{
+\code{\link[=collect]{collect()}} for the opposite action; downloading remote data into
+a local tbl.
+}
diff --git a/man/copy_to.src_sql.Rd b/man/copy_to.src_sql.Rd
deleted file mode 100644
index f5ca2d4..0000000
--- a/man/copy_to.src_sql.Rd
+++ /dev/null
@@ -1,60 +0,0 @@
-% Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/tbl-sql.r
-\name{copy_to.src_sql}
-\alias{copy_to.src_sql}
-\title{Copy a local data frame to a sqlite src.}
-\usage{
-\method{copy_to}{src_sql}(dest, df, name = deparse(substitute(df)),
- types = NULL, temporary = TRUE, unique_indexes = NULL, indexes = NULL,
- analyze = TRUE, ...)
-}
-\arguments{
-\item{dest}{remote data source}
-
-\item{df}{local data frame}
-
-\item{name}{name for new remote table.}
-
-\item{types}{a character vector giving variable types to use for the columns.
-See \url{http://www.sqlite.org/datatype3.html} for available types.}
-
-\item{temporary}{if \code{TRUE}, will create a temporary table that is
-local to this connection and will be automatically deleted when the
-connection expires}
-
-\item{unique_indexes}{a list of character vectors. Each element of the list
-will create a new unique index over the specified column(s). Duplicate rows
-will result in failure.}
-
-\item{indexes}{a list of character vectors. Each element of the list
-will create a new index.}
-
-\item{analyze}{if \code{TRUE} (the default), will automatically ANALYZE the
-new table so that the query optimiser has useful information.}
-
-\item{...}{other parameters passed to methods.}
-}
-\value{
-a sqlite \code{\link{tbl}} object
-}
-\description{
-This standard method works for all sql sources.
-}
-\examples{
-if (requireNamespace("RSQLite")) {
-db <- src_sqlite(tempfile(), create = TRUE)
-
-iris2 <- copy_to(db, iris)
-mtcars$model <- rownames(mtcars)
-mtcars2 <- copy_to(db, mtcars, indexes = list("model"))
-
-explain(filter(mtcars2, model == "Hornet 4 Drive"))
-
-# Note that tables are temporary by default, so they're not
-# visible from other connections to the same database.
-src_tbls(db)
-db2 <- src_sqlite(db$path)
-src_tbls(db2)
-}
-}
-
diff --git a/man/cumall.Rd b/man/cumall.Rd
index 657adbe..713bd18 100644
--- a/man/cumall.Rd
+++ b/man/cumall.Rd
@@ -13,12 +13,11 @@ cumany(x)
cummean(x)
}
\arguments{
-\item{x}{For \code{cumall} & \code{cumany}, a logical vector; for
-\code{cummean} an integer or numeric vector}
+\item{x}{For \code{cumall()} and \code{cumany()}, a logical vector; for
+\code{cummean()} an integer or numeric vector}
}
\description{
-dplyr adds \code{cumall}, \code{cumany}, and \code{cummean} to complete
+dplyr adds \code{cumall()}, \code{cumany()}, and \code{cummean()} to complete
R's set of cumulate functions to match the aggregation functions available
in most databases
}
-
diff --git a/man/desc.Rd b/man/desc.Rd
index 4555799..73d9f26 100644
--- a/man/desc.Rd
+++ b/man/desc.Rd
@@ -2,7 +2,7 @@
% Please edit documentation in R/desc.r
\name{desc}
\alias{desc}
-\title{Descending order.}
+\title{Descending order}
\usage{
desc(x)
}
@@ -11,11 +11,14 @@ desc(x)
}
\description{
Transform a vector into a format that will be sorted in descending order.
+This is useful within \code{\link[=arrange]{arrange()}}.
}
\examples{
desc(1:10)
desc(factor(letters))
+
first_day <- seq(as.Date("1910/1/1"), as.Date("1920/1/1"), "years")
desc(first_day)
-}
+starwars \%>\% arrange(desc(mass))
+}
diff --git a/man/dim_desc.Rd b/man/dim_desc.Rd
index 491b08a..5bdd14f 100644
--- a/man/dim_desc.Rd
+++ b/man/dim_desc.Rd
@@ -17,4 +17,3 @@ substituting \code{NA} with ?? (for SQL queries).
dim_desc(mtcars)
}
\keyword{internal}
-
diff --git a/man/distinct.Rd b/man/distinct.Rd
index 83a780f..f59ca6c 100644
--- a/man/distinct.Rd
+++ b/man/distinct.Rd
@@ -2,12 +2,9 @@
% Please edit documentation in R/distinct.R
\name{distinct}
\alias{distinct}
-\alias{distinct_}
-\title{Select distinct/unique rows.}
+\title{Select distinct/unique rows}
\usage{
distinct(.data, ..., .keep_all = FALSE)
-
-distinct_(.data, ..., .dots, .keep_all = FALSE)
}
\arguments{
\item{.data}{a tbl}
@@ -19,16 +16,13 @@ row will be preserved. If omitted, will use all variables.}
\item{.keep_all}{If \code{TRUE}, keep all variables in \code{.data}.
If a combination of \code{...} is not distinct, this keeps the
first row of values.}
-
-\item{.dots}{Used to work around non-standard evaluation. See
-\code{vignette("nse")} for details.}
}
\description{
Retain only unique/distinct rows from an input tbl. This is similar
-to \code{\link{unique.data.frame}}, but considerably faster.
+to \code{\link[=unique.data.frame]{unique.data.frame()}}, but considerably faster.
}
\examples{
-df <- data.frame(
+df <- tibble(
x = sample(10, 100, rep = TRUE),
y = sample(10, 100, rep = TRUE)
)
@@ -45,5 +39,13 @@ distinct(df, y, .keep_all = TRUE)
# You can also use distinct on computed variables
distinct(df, diff = abs(x - y))
-}
+# The same behaviour applies for grouped data frames
+# except that the grouping variables are always included
+df <- tibble(
+ g = c(1, 1, 2, 2),
+ x = c(1, 1, 2, 1)
+) \%>\% group_by(g)
+df \%>\% distinct()
+df \%>\% distinct(x)
+}
diff --git a/man/do.Rd b/man/do.Rd
index 69ddbdc..45f2454 100644
--- a/man/do.Rd
+++ b/man/do.Rd
@@ -1,16 +1,10 @@
% Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/do.r, R/tbl-sql.r
+% Please edit documentation in R/do.r
\name{do}
\alias{do}
-\alias{do_}
-\alias{do_.tbl_sql}
-\title{Do arbitrary operations on a tbl.}
+\title{Do anything}
\usage{
do(.data, ...)
-
-do_(.data, ..., .dots)
-
-\method{do_}{tbl_sql}(.data, ..., .dots, .chunk_size = 10000L)
}
\arguments{
\item{.data}{a tbl}
@@ -19,37 +13,29 @@ do_(.data, ..., .dots)
stored in a new column. If unnamed, should return a data frame. You can
use \code{.} to refer to the current group. You can not mix named and
unnamed arguments.}
-
-\item{.dots}{Used to work around non-standard evaluation. See
-\code{vignette("nse")} for details.}
-
-\item{.chunk_size}{The size of each chunk to pull into R. If this number is
-too big, the process will be slow because R has to allocate and free a lot
-of memory. If it's too small, it will be slow, because of the overhead of
-talking to the database.}
}
\value{
-\code{do} always returns a data frame. The first columns in the data frame
+\code{do()} always returns a data frame. The first columns in the data frame
will be the labels, the others will be computed from \code{...}. Named
arguments become list-columns, with one element for each group; unnamed
elements must be data frames and labels will be duplicated accordingly.
Groups are preserved for a single unnamed input. This is different to
-\code{\link{summarise}} because \code{do} generally does not reduce the
+\code{\link[=summarise]{summarise()}} because \code{do()} generally does not reduce the
complexity of the data, it just expresses it in a special way. For
multiple named inputs, the output is grouped by row with
-\code{\link{rowwise}}. This allows other verbs to work in an intuitive
+\code{\link[=rowwise]{rowwise()}}. This allows other verbs to work in an intuitive
way.
}
\description{
This is a general purpose complement to the specialised manipulation
-functions \code{\link{filter}}, \code{\link{select}}, \code{\link{mutate}},
-\code{\link{summarise}} and \code{\link{arrange}}. You can use \code{do}
+functions \code{\link[=filter]{filter()}}, \code{\link[=select]{select()}}, \code{\link[=mutate]{mutate()}},
+\code{\link[=summarise]{summarise()}} and \code{\link[=arrange]{arrange()}}. You can use \code{do()}
to perform arbitrary computation, returning either a data frame or
arbitrary objects which will be stored in a list. This is particularly
useful when working with models: you can fit models per group with
-\code{do} and then flexibly extract components with either another
-\code{do} or \code{summarise}.
+\code{do()} and then flexibly extract components with either another
+\code{do()} or \code{summarise()}.
}
\details{
For an empty data frame, the expressions will be evaluated once, even in the
@@ -59,13 +45,14 @@ data frame is the same for both empty and non-empty input.
\section{Connection to plyr}{
-If you're familiar with plyr, \code{do} with named arguments is basically
-equivalent to \code{dlply}, and \code{do} with a single unnamed argument
-is basically equivalent to \code{ldply}. However, instead of storing
+If you're familiar with plyr, \code{do()} with named arguments is basically
+equivalent to \code{\link[plyr:dlply]{plyr::dlply()}}, and \code{do()} with a single unnamed argument
+is basically equivalent to \code{\link[plyr:ldply]{plyr::ldply()}}. However, instead of storing
labels in a separate attribute, the result is always a data frame. This
-means that \code{summarise} applied to the result of \code{do} can
-act like \code{ldply}.
+means that \code{summarise()} applied to the result of \code{do()} can
+act like \code{ldply()}.
}
+
\examples{
by_cyl <- group_by(mtcars, cyl)
do(by_cyl, head(., 2))
@@ -106,4 +93,3 @@ by_dest \%>\% do(smooth = gam(arr_delay ~ s(dep_time) + month, data = .))
}
}
}
-
diff --git a/man/dplyr-package.Rd b/man/dplyr-package.Rd
new file mode 100644
index 0000000..f2ee0c1
--- /dev/null
+++ b/man/dplyr-package.Rd
@@ -0,0 +1,72 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/dplyr.r
+\docType{package}
+\name{dplyr-package}
+\alias{dplyr}
+\alias{dplyr-package}
+\title{dplyr: a grammar of data manipulation}
+\description{
+dplyr provides a flexible grammar of data manipulation. It's the next
+iteration of plyr, focused on tools for working with data frames (hence the
+\emph{d} in the name).
+}
+\details{
+It has three main goals:
+
+\itemize{
+\item Identify the most important data manipulation verbs and make them
+easy to use from R.
+\item Provide blazing fast performance for in-memory data by writing key
+pieces in C++ (using Rcpp)
+\item Use the same interface to work with data no matter where it's stored,
+whether in a data frame, a data table or database.
+}
+
+To learn more about dplyr, start with the vignettes:
+\code{browseVignettes(package = "dplyr")}
+}
+\section{Package options}{
+
+\describe{
+\item{\code{dplyr.show_progress}}{Should lengthy operations such as \code{do()}
+show a progress bar? Default: \code{TRUE}}
+}
+}
+
+\section{Package configurations}{
+
+These can be set on a package-by-package basis, or for the global environment.
+See \code{\link[pkgconfig:set_config]{pkgconfig::set_config()}} for usage.
+\describe{
+\item{\code{dplyr::na_matches}}{Should \code{NA} values be matched in data frame joins
+by default? Default: \code{"na"} (for compatibility with dplyr v0.5.0 and earlier,
+subject to change), alternative value: \code{"never"} (the default
+for database backends, see \code{\link[=join.tbl_df]{join.tbl_df()}}).}
+}
+}
+
+\seealso{
+Useful links:
+\itemize{
+ \item \url{http://dplyr.tidyverse.org}
+ \item \url{https://github.com/tidyverse/dplyr}
+ \item Report bugs at \url{https://github.com/tidyverse/dplyr/issues}
+}
+
+}
+\author{
+\strong{Maintainer}: Hadley Wickham \email{hadley at rstudio.com}
+
+Authors:
+\itemize{
+ \item Romain Francois \email{romain at r-enthusiasts.com}
+ \item Lionel Henry
+ \item Kirill Müller
+}
+
+Other contributors:
+\itemize{
+ \item RStudio [copyright holder, funder]
+}
+
+}
diff --git a/man/dplyr.Rd b/man/dplyr.Rd
deleted file mode 100644
index f05cdce..0000000
--- a/man/dplyr.Rd
+++ /dev/null
@@ -1,28 +0,0 @@
-% Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/dplyr.r
-\docType{package}
-\name{dplyr}
-\alias{dplyr}
-\alias{dplyr-package}
-\title{dplyr: a grammar of data manipulation}
-\description{
-dplyr provides a flexible grammar of data manipulation. It's the next
-iteration of plyr, focused on tools for working with data frames (hence the
-\emph{d} in the name).
-}
-\details{
-It has three main goals:
-
-\itemize{
-\item Identify the most important data manipulation verbs and make them
- easy to use from R.
-\item Provide blazing fast performance for in-memory data by writing key
- pieces in C++ (using Rcpp)
-\item Use the same interface to work with data no matter where it's stored,
- whether in a data frame, a data table or database.
-}
-
-To learn more about dplyr, start with the vignettes:
-\code{browseVignettes(package = "dplyr")}
-}
-
diff --git a/man/dr_dplyr.Rd b/man/dr_dplyr.Rd
new file mode 100644
index 0000000..f4e41a5
--- /dev/null
+++ b/man/dr_dplyr.Rd
@@ -0,0 +1,18 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/dr.R
+\name{dr_dplyr}
+\alias{dr_dplyr}
+\title{Dr Dplyr checks your installation for common problems.}
+\usage{
+dr_dplyr()
+}
+\description{
+Only run this if you are seeing problems, like random crashes.
+It's possible for \code{dr_dplyr} to return false positives, so there's no
+need to run if all is ok.
+}
+\examples{
+\dontrun{
+dr_dplyr()
+}
+}
diff --git a/man/explain.Rd b/man/explain.Rd
index 25101b5..5b21c51 100644
--- a/man/explain.Rd
+++ b/man/explain.Rd
@@ -3,21 +3,24 @@
\name{explain}
\alias{explain}
\alias{show_query}
-\title{Explain details of a tbl.}
+\title{Explain details of a tbl}
\usage{
explain(x, ...)
-show_query(x)
+show_query(x, ...)
}
\arguments{
\item{x}{An object to explain}
\item{...}{Other parameters possibly used by generic}
}
+\value{
+The first argument, invisibly.
+}
\description{
This is a generic function which gives more details about an object than
-\code{\link{print}}, and is more focussed on human readable output than
-\code{\link{str}}.
+\code{\link[=print]{print()}}, and is more focused on human readable output than
+\code{\link[=str]{str()}}.
}
\section{Databases}{
@@ -26,9 +29,10 @@ will describe the query plan. This requires a little bit of knowledge about
how \code{EXPLAIN} works for your database, but is very useful for
diagnosing performance problems.
}
+
\examples{
\donttest{
-if (require("RSQLite") && has_lahman("sqlite")) {
+if (require("dbplyr")) {
lahman_s <- lahman_sqlite()
batting <- tbl(lahman_s, "Batting")
@@ -48,4 +52,3 @@ batting \%>\% left_join(teams, c("yearID", "teamID")) \%>\% explain()
}
}
}
-
diff --git a/man/failwith.Rd b/man/failwith.Rd
index de9f748..6ff8247 100644
--- a/man/failwith.Rd
+++ b/man/failwith.Rd
@@ -17,22 +17,9 @@ failwith(default = NULL, f, quiet = FALSE)
a function
}
\description{
-Modify a function so that it returns a default value when there is an
-error.
-}
-\examples{
-f <- function(x) if (x == 1) stop("Error!") else 1
-\dontrun{
-f(1)
-f(2)
-}
-
-safef <- failwith(NULL, f)
-safef(1)
-safef(2)
+Deprecated. Please use \code{\link[purrr:possibly]{purrr::possibly()}} instead.
}
\seealso{
-\code{\link[plyr]{try_default}}
+\code{\link[plyr:try_default]{plyr::try_default()}}
}
-\keyword{debugging}
-
+\keyword{internal}
diff --git a/man/figures/logo.png b/man/figures/logo.png
new file mode 100644
index 0000000..10f24c4
Binary files /dev/null and b/man/figures/logo.png differ
diff --git a/man/filter.Rd b/man/filter.Rd
index eef8d26..16d9823 100644
--- a/man/filter.Rd
+++ b/man/filter.Rd
@@ -2,45 +2,74 @@
% Please edit documentation in R/manip.r
\name{filter}
\alias{filter}
-\alias{filter_}
-\title{Return rows with matching conditions.}
+\title{Return rows with matching conditions}
\usage{
filter(.data, ...)
-
-filter_(.data, ..., .dots)
}
\arguments{
\item{.data}{A tbl. All main verbs are S3 generics and provide methods
-for \code{\link{tbl_df}}, \code{\link[dtplyr]{tbl_dt}} and \code{\link{tbl_sql}}.}
+for \code{\link[=tbl_df]{tbl_df()}}, \code{\link[dtplyr:tbl_dt]{dtplyr::tbl_dt()}} and \code{\link[dbplyr:tbl_dbi]{dbplyr::tbl_dbi()}}.}
-\item{...}{Logical predicates. Multiple conditions are combined with \code{&}.}
+\item{...}{Logical predicates defined in terms of the variables in \code{.data}.
+Multiple conditions are combined with \code{&}. Only rows where the
+condition evaluates to \code{TRUE} are kept.
-\item{.dots}{Used to work around non-standard evaluation. See
-\code{vignette("nse")} for details.}
+These arguments are automatically \link[rlang:quo]{quoted} and
+\link[rlang:eval_tidy]{evaluated} in the context of the data
+frame. They support \link[rlang:quasiquotation]{unquoting} and
+splicing. See \code{vignette("programming")} for an introduction to
+these concepts.}
}
\value{
An object of the same class as \code{.data}.
-
- Data frame row names are silently dropped. To preserve, convert to an
- explicit variable.
}
\description{
-Return rows with matching conditions.
+Use \code{filter()} to find rows/cases where conditions are true. Unlike
+base subsetting, rows where the condition evaluates to \code{NA} are dropped.
+}
+\details{
+Note that dplyr is not yet smart enough to optimise filtering
+on grouped datasets that don't need grouped calculations. For this reason,
+filtering is often considerably faster on \code{\link[=ungroup]{ungroup()}}ed data.
+}
+\section{Useful filter functions}{
+
+\itemize{
+\item \code{\link{==}}, \code{\link{>}}, \code{\link{>=}} etc
+\item \code{\link{&}}, \code{\link{|}}, \code{\link{!}}, \code{\link[=xor]{xor()}}
+\item \code{\link[=is.na]{is.na()}}
+\item \code{\link[=between]{between()}}, \code{\link[=near]{near()}}
+}
}
+
+\section{Tidy data}{
+
+When applied to a data frame, row names are silently dropped. To preserve,
+convert to an explicit variable with \code{\link[tibble:rownames_to_column]{tibble::rownames_to_column()}}.
+}
+
+\section{Scoped filtering}{
+
+The three \link{scoped} variants (\code{\link[=filter_all]{filter_all()}}, \code{\link[=filter_if]{filter_if()}} and
+\code{\link[=filter_at]{filter_at()}}) make it easy to apply a filtering condition to a
+selection of variables.
+}
+
\examples{
-filter(mtcars, cyl == 8)
-filter(mtcars, cyl < 6)
+filter(starwars, species == "Human")
+filter(starwars, mass > 1000)
# Multiple criteria
-filter(mtcars, cyl < 6 & vs == 1)
-filter(mtcars, cyl < 6 | vs == 1)
+filter(starwars, hair_color == "none" & eye_color == "black")
+filter(starwars, hair_color == "none" | eye_color == "black")
# Multiple arguments are equivalent to and
-filter(mtcars, cyl < 6, vs == 1)
+filter(starwars, hair_color == "none", eye_color == "black")
}
\seealso{
-Other single.table.verbs: \code{\link{arrange}},
+\code{\link[=filter_all]{filter_all()}}, \code{\link[=filter_if]{filter_if()}} and \code{\link[=filter_at]{filter_at()}}.
+
+Other single table verbs: \code{\link{arrange}},
\code{\link{mutate}}, \code{\link{select}},
\code{\link{slice}}, \code{\link{summarise}}
}
-
diff --git a/man/filter_all.Rd b/man/filter_all.Rd
new file mode 100644
index 0000000..ee00836
--- /dev/null
+++ b/man/filter_all.Rd
@@ -0,0 +1,59 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/colwise-filter.R
+\name{filter_all}
+\alias{filter_all}
+\alias{filter_if}
+\alias{filter_at}
+\title{Filter within a selection of variables}
+\usage{
+filter_all(.tbl, .vars_predicate)
+
+filter_if(.tbl, .predicate, .vars_predicate)
+
+filter_at(.tbl, .vars, .vars_predicate)
+}
+\arguments{
+\item{.tbl}{A \code{tbl} object.}
+
+\item{.vars_predicate}{A quoted predicate expression as returned by
+\code{\link[=all_vars]{all_vars()}} or \code{\link[=any_vars]{any_vars()}}.}
+
+\item{.predicate}{A predicate function to be applied to the columns
+or a logical vector. The variables for which \code{.predicate} is or
+returns \code{TRUE} are selected. This argument is passed to
+\code{\link[rlang:as_function]{rlang::as_function()}} and thus supports quosure-style lambda
+functions and strings representing function names.}
+
+\item{.vars}{A list of columns generated by \code{\link[=vars]{vars()}},
+or a character vector of column names, or a numeric vector of column
+positions.}
+}
+\description{
+These \link{scoped} filtering verbs apply a predicate expression to a
+selection of variables. The predicate expression should be quoted
+with \code{\link[=all_vars]{all_vars()}} or \code{\link[=any_vars]{any_vars()}} and should mention the pronoun
+\code{.} to refer to variables.
+}
+\examples{
+# While filter() accepts expressions with specific variables, the
+# scoped filter verbs take an expression with the pronoun `.` and
+# replicate it over all variables. This expression should be quoted
+# with all_vars() or any_vars():
+all_vars(is.na(.))
+any_vars(is.na(.))
+
+
+# You can take the intersection of the replicated expressions:
+filter_all(mtcars, all_vars(. > 150))
+
+# Or the union:
+filter_all(mtcars, any_vars(. > 150))
+
+
+# You can vary the selection of columns on which to apply the
+# predicate. filter_at() takes a vars() specification:
+filter_at(mtcars, vars(starts_with("d")), any_vars((. \%\% 2) == 0))
+
+# And filter_if() selects variables with a predicate function:
+filter_if(mtcars, ~ all(floor(.) == .), all_vars(. != 0))
+}
diff --git a/man/funs.Rd b/man/funs.Rd
index e657e59..52b58d0 100644
--- a/man/funs.Rd
+++ b/man/funs.Rd
@@ -2,40 +2,37 @@
% Please edit documentation in R/funs.R
\name{funs}
\alias{funs}
-\alias{funs_}
\title{Create a list of functions calls.}
\usage{
-funs(...)
-
-funs_(dots, args = list(), env = baseenv())
+funs(..., .args = list())
}
\arguments{
-\item{dots, ...}{A list of functions specified by:
-
- \itemize{
- \item Their name, \code{"mean"}
- \item The function itself, \code{mean}
- \item A call to the function with \code{.} as a dummy parameter,
- \code{mean(., na.rm = TRUE)}
- }}
+\item{...}{A list of functions specified by:
+\itemize{
+\item Their name, \code{"mean"}
+\item The function itself, \code{mean}
+\item A call to the function with \code{.} as a dummy argument,
+\code{mean(., na.rm = TRUE)}
+}
-\item{args}{A named list of additional arguments to be added to all
-function calls.}
+These arguments are automatically \link[rlang:quo]{quoted}. They
+support \link[rlang:quasiquotation]{unquoting} and splicing. See
+\code{vignette("programming")} for an introduction to these concepts.}
-\item{env}{The environment in which functions should be evaluated.}
+\item{.args, args}{A named list of additional arguments to be added
+to all function calls.}
}
\description{
-\code{funs} provides a flexible way to generate a named list of functions for
-input to other functions like \code{summarise_each}.
+\code{funs()} provides a flexible way to generate a named list of
+functions for input to other functions like \code{\link[=summarise_at]{summarise_at()}}.
}
\examples{
funs(mean, "mean", mean(., na.rm = TRUE))
-# Overide default names
+# Override default names
funs(m1 = mean, m2 = "mean", m3 = mean(., na.rm = TRUE))
# If you have function names in a vector, use funs_
fs <- c("min", "max")
funs_(fs)
}
-
diff --git a/man/group_by.Rd b/man/group_by.Rd
index 284570f..e4eb537 100644
--- a/man/group_by.Rd
+++ b/man/group_by.Rd
@@ -2,79 +2,91 @@
% Please edit documentation in R/group-by.r
\name{group_by}
\alias{group_by}
-\alias{group_by_}
-\alias{regroup}
-\title{Group a tbl by one or more variables.}
+\alias{ungroup}
+\title{Group by one or more variables}
\usage{
group_by(.data, ..., add = FALSE)
-group_by_(.data, ..., .dots, add = FALSE)
+ungroup(x, ...)
}
\arguments{
\item{.data}{a tbl}
-\item{...}{variables to group by. All tbls accept variable names,
-some will also accept functions of variables. Duplicated groups
+\item{...}{Variables to group by. All tbls accept variable names.
+Some tbls will accept functions of variables. Duplicated groups
will be silently dropped.}
-\item{add}{By default, when \code{add = FALSE}, \code{group_by} will
-override existing groups. To instead add to the existing groups,
-use \code{add = TRUE}}
+\item{add}{When \code{add = FALSE}, the default, \code{group_by()} will
+override existing groups. To add to the existing groups, use
+\code{add = TRUE}.}
-\item{.dots}{Used to work around non-standard evaluation. See
-\code{vignette("nse")} for details.}
+\item{x}{A \code{\link[=tbl]{tbl()}}}
}
\description{
-Most data operations are useful done on groups defined by variables in the
-the dataset. The \code{group_by} function takes an existing tbl
-and converts it into a grouped tbl where operations are performed
-"by group".
+Most data operations are done on groups defined by variables.
+\code{group_by()} takes an existing tbl and converts it into a grouped tbl
+where operations are performed "by group". \code{ungroup()} removes grouping.
}
\section{Tbl types}{
-\code{group_by} is an S3 generic with methods for the three built-in
+\code{group_by()} is an S3 generic with methods for the three built-in
tbls. See the help for the corresponding classes and their manip
methods for more details:
\itemize{
- \item data.frame: \link{grouped_df}
- \item data.table: \link[dtplyr]{grouped_dt}
- \item SQLite: \code{\link{src_sqlite}}
- \item PostgreSQL: \code{\link{src_postgres}}
- \item MySQL: \code{\link{src_mysql}}
+\item data.frame: \link{grouped_df}
+\item data.table: \link[dtplyr:grouped_dt]{dtplyr::grouped_dt}
+\item SQLite: \code{\link[=src_sqlite]{src_sqlite()}}
+\item PostgreSQL: \code{\link[=src_postgres]{src_postgres()}}
+\item MySQL: \code{\link[=src_mysql]{src_mysql()}}
}
}
+
+\section{Scoped grouping}{
+
+
+The three \link{scoped} variants (\code{\link[=group_by_all]{group_by_all()}}, \code{\link[=group_by_if]{group_by_if()}} and
+\code{\link[=group_by_at]{group_by_at()}}) make it easy to group a dataset by a selection of
+variables.
+}
+
\examples{
-by_cyl <- group_by(mtcars, cyl)
-summarise(by_cyl, mean(disp), mean(hp))
-filter(by_cyl, disp == max(disp))
+by_cyl <- mtcars \%>\% group_by(cyl)
+
+# grouping doesn't change how the data looks (apart from listing
+# how it's grouped):
+by_cyl
-# summarise peels off a single layer of grouping
-by_vs_am <- group_by(mtcars, vs, am)
-by_vs <- summarise(by_vs_am, n = n())
+# It changes how it acts with the other dplyr verbs:
+by_cyl \%>\% summarise(
+ disp = mean(disp),
+ hp = mean(hp)
+)
+by_cyl \%>\% filter(disp == max(disp))
+
+# Each call to summarise() removes a layer of grouping
+by_vs_am <- mtcars \%>\% group_by(vs, am)
+by_vs <- by_vs_am \%>\% summarise(n = n())
by_vs
-summarise(by_vs, n = sum(n))
-# use ungroup() to remove if not wanted
-summarise(ungroup(by_vs), n = sum(n))
+by_vs \%>\% summarise(n = sum(n))
+
+# To remove grouping, use ungroup
+by_vs \%>\%
+ ungroup() \%>\%
+ summarise(n = sum(n))
# You can group by expressions: this is just short-hand for
# a mutate/rename followed by a simple group_by
-group_by(mtcars, vsam = vs + am)
-group_by(mtcars, vs2 = vs)
-
-# You can also group by a constant, but it's not very useful
-group_by(mtcars, "vs")
+mtcars \%>\% group_by(vsam = vs + am)
-# By default, group_by sets groups. Use add = TRUE to add groups
-groups(group_by(by_cyl, vs, am))
-groups(group_by(by_cyl, vs, am, add = TRUE))
+# By default, group_by overrides existing grouping
+by_cyl \%>\%
+ group_by(vs, am) \%>\%
+ group_vars()
-# Duplicate groups are silently dropped
-groups(group_by(by_cyl, cyl, cyl))
+# Use add = TRUE to instead append
+by_cyl \%>\%
+ group_by(vs, am, add = TRUE) \%>\%
+ group_vars()
}
-\seealso{
-\code{\link{ungroup}} for the inverse operation,
- \code{\link{groups}} for accessors that don't do special evaluation.
-}
-
diff --git a/man/group_by_all.Rd b/man/group_by_all.Rd
new file mode 100644
index 0000000..0d06087
--- /dev/null
+++ b/man/group_by_all.Rd
@@ -0,0 +1,62 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/colwise-group-by.R
+\name{group_by_all}
+\alias{group_by_all}
+\alias{group_by_at}
+\alias{group_by_if}
+\title{Group by a selection of variables}
+\usage{
+group_by_all(.tbl, .funs = list(), ...)
+
+group_by_at(.tbl, .vars, .funs = list(), ..., .add = FALSE)
+
+group_by_if(.tbl, .predicate, .funs = list(), ..., .add = FALSE)
+}
+\arguments{
+\item{.tbl}{A \code{tbl} object.}
+
+\item{.funs}{List of function calls generated by \code{\link[=funs]{funs()}}, or a
+character vector of function names, or simply a function.
+
+Bare formulas are passed to \code{\link[rlang:as_function]{rlang::as_function()}} to create
+purrr-style lambda functions. Note that these lambdas prevent
+hybrid evaluation from happening and it is thus more efficient to
+supply functions like \code{mean()} directly rather than in a
+lambda-formula.}
+
+\item{...}{Additional arguments for the function calls in
+\code{.funs}. These are evaluated only once, with \link[rlang:dots_list]{explicit
+splicing}.}
+
+\item{.vars}{A list of columns generated by \code{\link[=vars]{vars()}},
+or a character vector of column names, or a numeric vector of column
+positions.}
+
+\item{.add}{Passed to the \code{add} argument of \code{\link[=group_by]{group_by()}}.}
+
+\item{.predicate}{A predicate function to be applied to the columns
+or a logical vector. The variables for which \code{.predicate} is or
+returns \code{TRUE} are selected. This argument is passed to
+\code{\link[rlang:as_function]{rlang::as_function()}} and thus supports quosure-style lambda
+functions and strings representing function names.}
+}
+\description{
+These \link{scoped} variants of \code{\link[=group_by]{group_by()}} group a data frame by a
+selection of variables. Like \code{\link[=group_by]{group_by()}}, they have optional
+\link{mutate} semantics.
+}
+\examples{
+# Group a data frame by all variables:
+group_by_all(mtcars)
+
+# Group by variables selected with a predicate:
+group_by_if(iris, is.factor)
+
+# Group by variables selected by name:
+group_by_at(mtcars, vars(vs, am))
+
+# Like group_by(), the scoped variants have optional mutate
+# semantics. This provides a shortcut for group_by() + mutate():
+group_by_all(mtcars, as.factor)
+group_by_if(iris, is.factor, as.character)
+}
diff --git a/man/group_by_prepare.Rd b/man/group_by_prepare.Rd
index 59d8c86..a59e12a 100644
--- a/man/group_by_prepare.Rd
+++ b/man/group_by_prepare.Rd
@@ -4,12 +4,12 @@
\alias{group_by_prepare}
\title{Prepare for grouping.}
\usage{
-group_by_prepare(.data, ..., .dots, add = FALSE)
+group_by_prepare(.data, ..., .dots = list(), add = FALSE)
}
\value{
A list
- \item{data}{Modified tbl}
- \item{groups}{Modified groups}
+\item{data}{Modified tbl}
+\item{groups}{Modified groups}
}
\description{
Performs standard operations that should happen before individual methods
@@ -17,4 +17,3 @@ process the data. This includes mutating the tbl to add new grouping columns
and updating the groups (based on add)
}
\keyword{internal}
-
diff --git a/man/group_indices.Rd b/man/group_indices.Rd
index 4e1ed48..2a7ecde 100644
--- a/man/group_indices.Rd
+++ b/man/group_indices.Rd
@@ -2,22 +2,16 @@
% Please edit documentation in R/group-indices.R
\name{group_indices}
\alias{group_indices}
-\alias{group_indices_}
\title{Group id.}
\usage{
group_indices(.data, ...)
-
-group_indices_(.data, ..., .dots)
}
\arguments{
\item{.data}{a tbl}
-\item{...}{variables to group by. All tbls accept variable names,
-some will also accept functions of variables. Duplicated groups
+\item{...}{Variables to group by. All tbls accept variable names.
+Some tbls will accept functions of variables. Duplicated groups
will be silently dropped.}
-
-\item{.dots}{Used to work around non-standard evaluation. See
-\code{vignette("nse")} for details.}
}
\description{
Generate a unique id for each group
@@ -26,6 +20,6 @@ Generate a unique id for each group
group_indices(mtcars, cyl)
}
\seealso{
-\code{\link{group_by}}
+\code{\link[=group_by]{group_by()}}
}
-
+\keyword{internal}
diff --git a/man/group_size.Rd b/man/group_size.Rd
index 493d193..88dba42 100644
--- a/man/group_size.Rd
+++ b/man/group_size.Rd
@@ -27,4 +27,4 @@ n_groups(by_dest)
group_size(by_dest)
}
}
-
+\keyword{internal}
diff --git a/man/grouped_df.Rd b/man/grouped_df.Rd
index a556348..448a0bb 100644
--- a/man/grouped_df.Rd
+++ b/man/grouped_df.Rd
@@ -1,42 +1,28 @@
% Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/dataframe.R, R/grouped-df.r
+% Please edit documentation in R/grouped-df.r
\name{grouped_df}
-\alias{as_data_frame.grouped_df}
\alias{grouped_df}
\alias{is.grouped_df}
-\title{Convert to a data frame}
+\alias{is_grouped_df}
+\title{A grouped data frame.}
\usage{
-\method{as_data_frame}{grouped_df}(x, ...)
-
grouped_df(data, vars, drop = TRUE)
is.grouped_df(x)
+
+is_grouped_df(x)
}
\arguments{
-\item{x}{A list. Each element of the list must have the same length.}
-
-\item{...}{Other arguments passed on to individual methods.}
-
\item{data}{a tbl or data frame.}
-\item{vars}{a list of quoted variables.}
+\item{vars}{a character vector or a list of \code{\link[=name]{name()}}}
\item{drop}{if \code{TRUE} preserve all factor levels, even those without
data.}
}
\description{
-Functions that convert the input to a \code{data_frame}.
-
-The easiest way to create a grouped data frame is to call the \code{group_by}
+The easiest way to create a grouped data frame is to call the \code{group_by()}
method on a data frame or tbl: this will take care of capturing
-the unevalated expressions for you.
-}
-\details{
-For a grouped data frame, the \code{\link[tibble]{as_data_frame}}
-S3 generic simply removes the grouping.
-}
-\seealso{
-\code{\link[tibble]{as_data_frame}}
+the unevaluated expressions for you.
}
\keyword{internal}
-
diff --git a/man/groups.Rd b/man/groups.Rd
index 0deecd5..b846e83 100644
--- a/man/groups.Rd
+++ b/man/groups.Rd
@@ -2,26 +2,22 @@
% Please edit documentation in R/group-by.r
\name{groups}
\alias{groups}
-\alias{ungroup}
-\title{Get/set the grouping variables for tbl.}
+\alias{group_vars}
+\title{Return grouping variables}
\usage{
groups(x)
-ungroup(x, ...)
+group_vars(x)
}
\arguments{
-\item{x}{data \code{\link{tbl}}}
-
-\item{...}{Additional arguments that maybe used by methods.}
+\item{x}{A \code{\link[=tbl]{tbl()}}}
}
\description{
-These functions do not perform non-standard evaluation, and so are useful
-when programming against \code{tbl} objects. \code{ungroup} is a convenient
-inline way of removing existing grouping.
+\code{group_vars()} returns a character vector; \code{groups()} returns a list of
+symbols.
}
\examples{
-grouped <- group_by(mtcars, cyl)
-groups(grouped)
-groups(ungroup(grouped))
+df <- tibble(x = 1, y = 2) \%>\% group_by(x, y)
+group_vars(df)
+groups(df)
}
-
diff --git a/man/id.Rd b/man/id.Rd
index c995376..f1c7da6 100644
--- a/man/id.Rd
+++ b/man/id.Rd
@@ -9,19 +9,18 @@ id(.variables, drop = FALSE)
\arguments{
\item{.variables}{list of variables}
-\item{drop}{drop unusued factor levels?}
+\item{drop}{drop unused factor levels?}
}
\value{
a numeric vector with attribute n, giving total number of
- possibilities
+possibilities
}
\description{
Properties:
\itemize{
- \item \code{order(id)} is equivalent to \code{do.call(order, df)}
- \item rows containing the same data have the same value
- \item if \code{drop = FALSE} then room for all possibilites
+\item \code{order(id)} is equivalent to \code{do.call(order, df)}
+\item rows containing the same data have the same value
+\item if \code{drop = FALSE} then room for all possibilities
}
}
\keyword{internal}
-
diff --git a/man/ident.Rd b/man/ident.Rd
new file mode 100644
index 0000000..8c91362
--- /dev/null
+++ b/man/ident.Rd
@@ -0,0 +1,19 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/compat-dbplyr.R
+\name{ident}
+\alias{ident}
+\title{Flag a character vector as SQL identifiers}
+\usage{
+ident(...)
+}
+\arguments{
+\item{...}{A character vector, or name-value pairs}
+}
+\description{
+\code{ident()} takes unquoted strings and quotes them for you; \code{ident_q()}
+assumes its input has already been quoted.
+}
+\details{
+These two \code{ident} clsases are used during SQL generation to make sure
+the values will be quoted as, not as strings.
+}
diff --git a/man/if_else.Rd b/man/if_else.Rd
index c405e74..d596355 100644
--- a/man/if_else.Rd
+++ b/man/if_else.Rd
@@ -2,7 +2,7 @@
% Please edit documentation in R/if_else.R
\name{if_else}
\alias{if_else}
-\title{Vectorised if.}
+\title{Vectorised if}
\usage{
if_else(condition, true, false, missing = NULL)
}
@@ -11,7 +11,7 @@ if_else(condition, true, false, missing = NULL)
\item{true, false}{Values to use for \code{TRUE} and \code{FALSE} values of
\code{condition}. They must be either the same length as \code{condition},
-or length 1. They must also be the same type: \code{if_else} checks that
+or length 1. They must also be the same type: \code{if_else()} checks that
they have the same type and same class. All other attributes are
taken from \code{true}.}
@@ -20,11 +20,11 @@ values.}
}
\value{
Where \code{condition} is \code{TRUE}, the matching value from
- \code{true}, where it's \code{FALSE}, the matching value from \code{false},
- otherwise \code{NA}.
+\code{true}, where it's \code{FALSE}, the matching value from \code{false},
+otherwise \code{NA}.
}
\description{
-Compared to the base \code{\link{ifelse}()}, this function is more strict.
+Compared to the base \code{\link[=ifelse]{ifelse()}}, this function is more strict.
It checks that \code{true} and \code{false} are the same type. This
strictness makes the output type more predictable, and makes it somewhat
faster.
@@ -40,4 +40,3 @@ ifelse(x \%in\% c("a", "b", "c"), x, factor(NA))
if_else(x \%in\% c("a", "b", "c"), x, factor(NA))
# Attributes are taken from the `true` vector,
}
-
diff --git a/man/init_logging.Rd b/man/init_logging.Rd
new file mode 100644
index 0000000..e9510f4
--- /dev/null
+++ b/man/init_logging.Rd
@@ -0,0 +1,17 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/RcppExports.R
+\name{init_logging}
+\alias{init_logging}
+\title{Enable internal logging}
+\usage{
+init_logging(log_level)
+}
+\arguments{
+\item{log_level}{A character value, one of "WARN", "INFO", "DEBUG", "VERB",
+or "NONE".}
+}
+\description{
+Log entries, depending on the log level, will be printed to the standard
+error stream.
+}
+\keyword{internal}
diff --git a/man/join.Rd b/man/join.Rd
index 92622ba..1b85b23 100644
--- a/man/join.Rd
+++ b/man/join.Rd
@@ -1,14 +1,14 @@
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/join.r
\name{join}
-\alias{anti_join}
-\alias{full_join}
-\alias{inner_join}
\alias{join}
+\alias{inner_join}
\alias{left_join}
\alias{right_join}
+\alias{full_join}
\alias{semi_join}
-\title{Join two tbls together.}
+\alias{anti_join}
+\title{Join two tbls together}
\usage{
inner_join(x, y, by = NULL, copy = FALSE, suffix = c(".x", ".y"), ...)
@@ -26,14 +26,14 @@ anti_join(x, y, by = NULL, copy = FALSE, ...)
\item{x, y}{tbls to join}
\item{by}{a character vector of variables to join by. If \code{NULL}, the
- default, \code{join} will do a natural join, using all variables with
- common names across the two tables. A message lists the variables so
- that you can check they're right (to suppress the message, simply
- explicitly list the variables that you want to join).
+default, \code{*_join()} will do a natural join, using all variables with
+common names across the two tables. A message lists the variables so
+that you can check they're right (to suppress the message, simply
+explicitly list the variables that you want to join).
- To join by different variables on x and y use a named vector.
- For example, \code{by = c("a" = "b")} will match \code{x.a} to
- \code{y.b}.}
+To join by different variables on x and y use a named vector.
+For example, \code{by = c("a" = "b")} will match \code{x.a} to
+\code{y.b}.}
\item{copy}{If \code{x} and \code{y} are not from the same data source,
and \code{copy} is \code{TRUE}, then \code{y} will be copied into the
@@ -41,7 +41,8 @@ same src as \code{x}. This allows you to join tables across srcs, but
it is a potentially expensive operation so you must opt into it.}
\item{suffix}{If there are non-joined duplicate variables in \code{x} and
-\code{y}, these suffixes will be added to the output to diambiguate them.}
+\code{y}, these suffixes will be added to the output to disambiguate them.
+Should be a character vector of length 2.}
\item{...}{other parameters passed onto methods}
}
@@ -49,8 +50,7 @@ it is a potentially expensive operation so you must opt into it.}
These are generic functions that dispatch to individual tbl methods - see the
method documentation for details of individual data sources. \code{x} and
\code{y} should usually be from the same data source, but if \code{copy} is
-\code{TRUE}, \code{y} will automatically be copied to the same source as
-\code{x} - this may be an expensive operation.
+\code{TRUE}, \code{y} will automatically be copied to the same source as \code{x}.
}
\section{Join types}{
@@ -58,32 +58,32 @@ method documentation for details of individual data sources. \code{x} and
Currently dplyr supports four join types:
\describe{
- \item{\code{inner_join}}{return all rows from \code{x} where there are matching
- values in \code{y}, and all columns from \code{x} and \code{y}. If there are multiple matches
- between \code{x} and \code{y}, all combination of the matches are returned.}
+\item{\code{inner_join()}}{return all rows from \code{x} where there are matching
+values in \code{y}, and all columns from \code{x} and \code{y}. If there are multiple matches
+between \code{x} and \code{y}, all combination of the matches are returned.}
- \item{\code{left_join}}{return all rows from \code{x}, and all columns from \code{x}
- and \code{y}. Rows in \code{x} with no match in \code{y} will have \code{NA} values in the new
- columns. If there are multiple matches between \code{x} and \code{y}, all combinations
- of the matches are returned.}
+\item{\code{left_join()}}{return all rows from \code{x}, and all columns from \code{x}
+and \code{y}. Rows in \code{x} with no match in \code{y} will have \code{NA} values in the new
+columns. If there are multiple matches between \code{x} and \code{y}, all combinations
+of the matches are returned.}
- \item{\code{right_join}}{return all rows from \code{y}, and all columns from \code{x}
- and y. Rows in \code{y} with no match in \code{x} will have \code{NA} values in the new
- columns. If there are multiple matches between \code{x} and \code{y}, all combinations
- of the matches are returned.}
+\item{\code{right_join()}}{return all rows from \code{y}, and all columns from \code{x}
+and y. Rows in \code{y} with no match in \code{x} will have \code{NA} values in the new
+columns. If there are multiple matches between \code{x} and \code{y}, all combinations
+of the matches are returned.}
- \item{\code{semi_join}}{return all rows from \code{x} where there are matching
- values in \code{y}, keeping just columns from \code{x}.
+\item{\code{semi_join()}}{return all rows from \code{x} where there are matching
+values in \code{y}, keeping just columns from \code{x}.
- A semi join differs from an inner join because an inner join will return
- one row of \code{x} for each matching row of \code{y}, where a semi
- join will never duplicate rows of \code{x}.}
+A semi join differs from an inner join because an inner join will return
+one row of \code{x} for each matching row of \code{y}, where a semi
+join will never duplicate rows of \code{x}.}
- \item{\code{anti_join}}{return all rows from \code{x} where there are not
- matching values in \code{y}, keeping just columns from \code{x}.}
+\item{\code{anti_join()}}{return all rows from \code{x} where there are not
+matching values in \code{y}, keeping just columns from \code{x}.}
- \item{\code{full_join}}{return all rows and all columns from both \code{x} and \code{y}.
- Where there are not matching values, returns \code{NA} for the one missing.}
+\item{\code{full_join()}}{return all rows and all columns from both \code{x} and \code{y}.
+Where there are not matching values, returns \code{NA} for the one missing.}
}
}
@@ -94,3 +94,22 @@ Groups are ignored for the purpose of joining, but the result preserves
the grouping of \code{x}.
}
+\examples{
+# "Mutating" joins add variables to the LHS
+band_members \%>\% inner_join(band_instruments)
+band_members \%>\% left_join(band_instruments)
+band_members \%>\% right_join(band_instruments)
+band_members \%>\% full_join(band_instruments)
+
+# "Filtering" joins keep cases from the LHS
+band_members \%>\% semi_join(band_instruments)
+band_members \%>\% anti_join(band_instruments)
+
+# To suppress the message, supply by
+band_members \%>\% inner_join(band_instruments, by = "name")
+# This is good practice in production code
+
+# Use a named `by` if the join variables have different names
+band_members \%>\% full_join(band_instruments2, by = c("name" = "artist"))
+# Note that only the key from the LHS is kept
+}
diff --git a/man/join.tbl_df.Rd b/man/join.tbl_df.Rd
index 74380e9..758a1ab 100644
--- a/man/join.tbl_df.Rd
+++ b/man/join.tbl_df.Rd
@@ -1,30 +1,34 @@
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/tbl-df.r
\name{join.tbl_df}
-\alias{anti_join.tbl_df}
-\alias{full_join.tbl_df}
-\alias{inner_join.tbl_df}
\alias{join.tbl_df}
+\alias{inner_join.tbl_df}
\alias{left_join.tbl_df}
\alias{right_join.tbl_df}
+\alias{full_join.tbl_df}
\alias{semi_join.tbl_df}
-\title{Join data frame tbls.}
+\alias{anti_join.tbl_df}
+\title{Join data frame tbls}
\usage{
\method{inner_join}{tbl_df}(x, y, by = NULL, copy = FALSE,
- suffix = c(".x", ".y"), ...)
+ suffix = c(".x", ".y"), ...,
+ na_matches = pkgconfig::get_config("dplyr::na_matches"))
\method{left_join}{tbl_df}(x, y, by = NULL, copy = FALSE, suffix = c(".x",
- ".y"), ...)
+ ".y"), ..., na_matches = pkgconfig::get_config("dplyr::na_matches"))
\method{right_join}{tbl_df}(x, y, by = NULL, copy = FALSE,
- suffix = c(".x", ".y"), ...)
+ suffix = c(".x", ".y"), ...,
+ na_matches = pkgconfig::get_config("dplyr::na_matches"))
\method{full_join}{tbl_df}(x, y, by = NULL, copy = FALSE, suffix = c(".x",
- ".y"), ...)
+ ".y"), ..., na_matches = pkgconfig::get_config("dplyr::na_matches"))
-\method{semi_join}{tbl_df}(x, y, by = NULL, copy = FALSE, ...)
+\method{semi_join}{tbl_df}(x, y, by = NULL, copy = FALSE, ...,
+ na_matches = pkgconfig::get_config("dplyr::na_matches"))
-\method{anti_join}{tbl_df}(x, y, by = NULL, copy = FALSE, ...)
+\method{anti_join}{tbl_df}(x, y, by = NULL, copy = FALSE, ...,
+ na_matches = pkgconfig::get_config("dplyr::na_matches"))
}
\arguments{
\item{x}{tbls to join}
@@ -32,14 +36,14 @@
\item{y}{tbls to join}
\item{by}{a character vector of variables to join by. If \code{NULL}, the
- default, \code{join} will do a natural join, using all variables with
- common names across the two tables. A message lists the variables so
- that you can check they're right (to suppress the message, simply
- explicitly list the variables that you want to join).
+default, \code{*_join()} will do a natural join, using all variables with
+common names across the two tables. A message lists the variables so
+that you can check they're right (to suppress the message, simply
+explicitly list the variables that you want to join).
- To join by different variables on x and y use a named vector.
- For example, \code{by = c("a" = "b")} will match \code{x.a} to
- \code{y.b}.}
+To join by different variables on x and y use a named vector.
+For example, \code{by = c("a" = "b")} will match \code{x.a} to
+\code{y.b}.}
\item{copy}{If \code{x} and \code{y} are not from the same data source,
and \code{copy} is \code{TRUE}, then \code{y} will be copied into the
@@ -47,12 +51,20 @@ same src as \code{x}. This allows you to join tables across srcs, but
it is a potentially expensive operation so you must opt into it.}
\item{suffix}{If there are non-joined duplicate variables in \code{x} and
-\code{y}, these suffixes will be added to the output to diambiguate them.}
+\code{y}, these suffixes will be added to the output to disambiguate them.
+Should be a character vector of length 2.}
\item{...}{included for compatibility with the generic; otherwise ignored.}
+
+\item{na_matches}{Use \code{"never"} to always treat two \code{NA} or \code{NaN} values as
+different, like joins for database sources, similarly to
+\code{merge(incomparables = FALSE)}.
+The default,\code{"na"}, always treats two \code{NA} or \code{NaN} values as equal, like \code{\link[=merge]{merge()}}.
+Users and package authors can change the default behavior by calling
+\code{pkgconfig::set_config("dplyr::na_matches" = "never")}.}
}
\description{
-See \code{\link{join}} for a description of the general purpose of the
+See \link{join} for a description of the general purpose of the
functions.
}
\examples{
@@ -75,4 +87,3 @@ anti_join(batting_df, person_df)
anti_join(person_df, batting_df)
}
}
-
diff --git a/man/join.tbl_sql.Rd b/man/join.tbl_sql.Rd
deleted file mode 100644
index d134dcd..0000000
--- a/man/join.tbl_sql.Rd
+++ /dev/null
@@ -1,131 +0,0 @@
-% Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/tbl-sql.r
-\name{join.tbl_sql}
-\alias{anti_join.tbl_lazy}
-\alias{full_join.tbl_lazy}
-\alias{inner_join.tbl_lazy}
-\alias{join.tbl_sql}
-\alias{left_join.tbl_lazy}
-\alias{right_join.tbl_lazy}
-\alias{semi_join.tbl_lazy}
-\title{Join sql tbls.}
-\usage{
-\method{inner_join}{tbl_lazy}(x, y, by = NULL, copy = FALSE,
- suffix = c(".x", ".y"), auto_index = FALSE, ...)
-
-\method{left_join}{tbl_lazy}(x, y, by = NULL, copy = FALSE,
- suffix = c(".x", ".y"), auto_index = FALSE, ...)
-
-\method{right_join}{tbl_lazy}(x, y, by = NULL, copy = FALSE,
- suffix = c(".x", ".y"), auto_index = FALSE, ...)
-
-\method{full_join}{tbl_lazy}(x, y, by = NULL, copy = FALSE,
- suffix = c(".x", ".y"), auto_index = FALSE, ...)
-
-\method{semi_join}{tbl_lazy}(x, y, by = NULL, copy = FALSE,
- auto_index = FALSE, ...)
-
-\method{anti_join}{tbl_lazy}(x, y, by = NULL, copy = FALSE,
- auto_index = FALSE, ...)
-}
-\arguments{
-\item{x}{tbls to join}
-
-\item{y}{tbls to join}
-
-\item{by}{a character vector of variables to join by. If \code{NULL}, the
- default, \code{join} will do a natural join, using all variables with
- common names across the two tables. A message lists the variables so
- that you can check they're right (to suppress the message, simply
- explicitly list the variables that you want to join).
-
- To join by different variables on x and y use a named vector.
- For example, \code{by = c("a" = "b")} will match \code{x.a} to
- \code{y.b}.}
-
-\item{copy}{If \code{x} and \code{y} are not from the same data source,
- and \code{copy} is \code{TRUE}, then \code{y} will be copied into a
- temporary table in same database as \code{x}. \code{join} will automatically
- run \code{ANALYZE} on the created table in the hope that this will make
- you queries as efficient as possible by giving more data to the query
- planner.
-
- This allows you to join tables across srcs, but it's potentially expensive
- operation so you must opt into it.}
-
-\item{suffix}{If there are non-joined duplicate variables in \code{x} and
-\code{y}, these suffixes will be added to the output to diambiguate them.}
-
-\item{auto_index}{if \code{copy} is \code{TRUE}, automatically create
-indices for the variables in \code{by}. This may speed up the join if
-there are matching indexes in \code{x}.}
-
-\item{...}{other parameters passed onto methods}
-}
-\description{
-See \code{\link{join}} for a description of the general purpose of the
-functions.
-}
-\section{Implementation notes}{
-
-
-Semi-joins are implemented using \code{WHERE EXISTS}, and anti-joins with
-\code{WHERE NOT EXISTS}. Support for semi-joins is somewhat partial: you
-can only create semi joins where the \code{x} and \code{y} columns are
-compared with \code{=} not with more general operators.
-}
-\examples{
-\dontrun{
-if (require("RSQLite") && has_lahman("sqlite")) {
-
-# Left joins ----------------------------------------------------------------
-lahman_s <- lahman_sqlite()
-batting <- tbl(lahman_s, "Batting")
-team_info <- select(tbl(lahman_s, "Teams"), yearID, lgID, teamID, G, R:H)
-
-# Combine player and whole team statistics
-first_stint <- select(filter(batting, stint == 1), playerID:H)
-both <- left_join(first_stint, team_info, type = "inner", by = c("yearID", "teamID", "lgID"))
-head(both)
-explain(both)
-
-# Join with a local data frame
-grid <- expand.grid(
- teamID = c("WAS", "ATL", "PHI", "NYA"),
- yearID = 2010:2012)
-top4a <- left_join(batting, grid, copy = TRUE)
-explain(top4a)
-
-# Indices don't really help here because there's no matching index on
-# batting
-top4b <- left_join(batting, grid, copy = TRUE, auto_index = TRUE)
-explain(top4b)
-
-# Semi-joins ----------------------------------------------------------------
-
-people <- tbl(lahman_s, "Master")
-
-# All people in half of fame
-hof <- tbl(lahman_s, "HallOfFame")
-semi_join(people, hof)
-
-# All people not in the hall of fame
-anti_join(people, hof)
-
-# Find all managers
-manager <- tbl(lahman_s, "Managers")
-semi_join(people, manager)
-
-# Find all managers in hall of fame
-famous_manager <- semi_join(semi_join(people, manager), hof)
-famous_manager
-explain(famous_manager)
-
-# Anti-joins ----------------------------------------------------------------
-
-# batters without person covariates
-anti_join(batting, people)
-}
-}
-}
-
diff --git a/man/lahman.Rd b/man/lahman.Rd
deleted file mode 100644
index 69a5440..0000000
--- a/man/lahman.Rd
+++ /dev/null
@@ -1,67 +0,0 @@
-% Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/data-lahman.r
-\name{lahman}
-\alias{copy_lahman}
-\alias{has_lahman}
-\alias{lahman}
-\alias{lahman_df}
-\alias{lahman_dt}
-\alias{lahman_mysql}
-\alias{lahman_postgres}
-\alias{lahman_sqlite}
-\alias{lahman_srcs}
-\title{Cache and retrieve an \code{src_sqlite} of the Lahman baseball database.}
-\usage{
-lahman_sqlite(path = NULL)
-
-lahman_postgres(dbname = "lahman", ...)
-
-lahman_mysql(dbname = "lahman", ...)
-
-lahman_df()
-
-lahman_dt()
-
-copy_lahman(src, ...)
-
-has_lahman(type, ...)
-
-lahman_srcs(..., quiet = NULL)
-}
-\arguments{
-\item{...}{Other arguments passed to \code{src} on first
-load. For mysql and postgresql, the defaults assume you have a local
-server with \code{lahman} database already created.
-For \code{lahman_srcs}, character vector of names giving srcs to generate.}
-
-\item{type}{src type.}
-
-\item{quiet}{if \code{TRUE}, suppress messages about databases failing to
-connect.}
-}
-\description{
-This creates an interesting database using data from the Lahman baseball
-data source, provided by Sean Lahman at
-\url{http://www.seanlahman.com/baseball-archive/statistics/}, and
-made easily available in R through the \pkg{Lahman} package by
-Michael Friendly, Dennis Murphy and Martin Monkman. See the documentation
-for that package for documentation of the inidividual tables.
-}
-\examples{
-# Connect to a local sqlite database, if already created
-\donttest{
-if (has_lahman("sqlite")) {
- lahman_sqlite()
- batting <- tbl(lahman_sqlite(), "Batting")
- batting
-}
-
-# Connect to a local postgres database with lahman database, if available
-if (has_lahman("postgres")) {
- lahman_postgres()
- batting <- tbl(lahman_postgres(), "Batting")
-}
-}
-}
-\keyword{internal}
-
diff --git a/man/lazy_ops.Rd b/man/lazy_ops.Rd
deleted file mode 100644
index 5228ecd..0000000
--- a/man/lazy_ops.Rd
+++ /dev/null
@@ -1,40 +0,0 @@
-% Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/lazy-ops.R
-\name{lazy_ops}
-\alias{add_op_single}
-\alias{lazy_ops}
-\alias{op_base}
-\alias{op_double}
-\alias{op_grps}
-\alias{op_single}
-\alias{op_sort}
-\alias{op_vars}
-\title{Lazy operations}
-\usage{
-op_base(name, src, x, vars)
-
-op_single(name, x, dots = list(), args = list())
-
-add_op_single(name, .data, dots = list(), args = list())
-
-op_double(name, x, y, args = list())
-
-op_grps(op)
-
-op_vars(op)
-
-op_sort(op)
-}
-\description{
-This set of S3 classes describe the action of dplyr verbs. These are
-currently used for SQL sources to separate the description of operations
-in R from their computation in SQL. This API is very new so is likely
-to evolve in the future.
-}
-\details{
-\code{op_vars} and \code{op_grps} compute the variables and groups from
-a sequence of lazy operations. \code{op_sort} tracks the order of the
-data for use in window functions.
-}
-\keyword{internal}
-
diff --git a/man/lead-lag.Rd b/man/lead-lag.Rd
index 246f33c..0a63c8d 100644
--- a/man/lead-lag.Rd
+++ b/man/lead-lag.Rd
@@ -1,9 +1,9 @@
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/lead-lag.R
\name{lead-lag}
-\alias{lag}
-\alias{lead}
\alias{lead-lag}
+\alias{lead}
+\alias{lag}
\title{Lead and lag.}
\usage{
lead(x, n = 1L, default = NA, order_by = NULL, ...)
@@ -13,18 +13,18 @@ lag(x, n = 1L, default = NA, order_by = NULL, ...)
\arguments{
\item{x}{a vector of values}
-\item{n}{a postive integer of length 1, giving the number of positions to
+\item{n}{a positive integer of length 1, giving the number of positions to
lead or lag by}
-\item{default}{value used for non-existant rows. Defaults to \code{NA}.}
+\item{default}{value used for non-existent rows. Defaults to \code{NA}.}
\item{order_by}{override the default ordering to use another vector}
\item{...}{Needed for compatibility with lag generic.}
}
\description{
-Lead and lag are useful for comparing values offset by a constant (e.g. the
-previous or next value)
+Find the "next" or "previous" values in a vector. Useful for comparing values
+ahead of or behind the current values.
}
\examples{
lead(1:10, 1)
@@ -46,4 +46,3 @@ arrange(wrong, year)
right <- mutate(scrambled, prev = lag(value, order_by = year))
arrange(right, year)
}
-
diff --git a/man/location.Rd b/man/location.Rd
index ac79855..ccd8f74 100644
--- a/man/location.Rd
+++ b/man/location.Rd
@@ -1,8 +1,8 @@
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/location.R
\name{location}
-\alias{changes}
\alias{location}
+\alias{changes}
\title{Print the location in memory of a data frame}
\usage{
location(df)
@@ -27,4 +27,4 @@ location(mtcars2)
changes(mtcars, mtcars)
changes(mtcars, mtcars2)
}
-
+\keyword{internal}
diff --git a/man/make_tbl.Rd b/man/make_tbl.Rd
index 2c635fe..92a29e4 100644
--- a/man/make_tbl.Rd
+++ b/man/make_tbl.Rd
@@ -11,17 +11,16 @@ make_tbl(subclass, ...)
must supply this value. \code{tbl_} is automatically prepended to the
class name}
-\item{...}{For \code{tbl}, other fields used by class. For \code{as.tbl},
+\item{...}{For \code{tbl()}, other fields used by class. For \code{as.tbl()},
other arguments passed to methods.}
\item{object}{to test/coerce.}
}
\description{
-\code{tbl} is the standard constructor for tbls. \code{as.tbl} coerces,
-and \code{is.tbl} tests.
+\code{tbl()} is the standard constructor for tbls. \code{as.tbl()} coerces,
+and \code{is.tbl()} tests.
}
\examples{
as.tbl(mtcars)
}
\keyword{internal}
-
diff --git a/man/mutate.Rd b/man/mutate.Rd
index 39e4591..8e2de83 100644
--- a/man/mutate.Rd
+++ b/man/mutate.Rd
@@ -2,48 +2,104 @@
% Please edit documentation in R/manip.r
\name{mutate}
\alias{mutate}
-\alias{mutate_}
\alias{transmute}
-\alias{transmute_}
-\title{Add new variables.}
+\title{Add new variables}
\usage{
mutate(.data, ...)
-mutate_(.data, ..., .dots)
-
transmute(.data, ...)
-
-transmute_(.data, ..., .dots)
}
\arguments{
\item{.data}{A tbl. All main verbs are S3 generics and provide methods
-for \code{\link{tbl_df}}, \code{\link[dtplyr]{tbl_dt}} and \code{\link{tbl_sql}}.}
+for \code{\link[=tbl_df]{tbl_df()}}, \code{\link[dtplyr:tbl_dt]{dtplyr::tbl_dt()}} and \code{\link[dbplyr:tbl_dbi]{dbplyr::tbl_dbi()}}.}
\item{...}{Name-value pairs of expressions. Use \code{NULL} to drop
-a variable.}
+a variable.
-\item{.dots}{Used to work around non-standard evaluation. See
-\code{vignette("nse")} for details.}
+These arguments are automatically \link[rlang:quo]{quoted} and
+\link[rlang:eval_tidy]{evaluated} in the context of the data
+frame. They support \link[rlang:quasiquotation]{unquoting} and
+splicing. See \code{vignette("programming")} for an introduction to
+these concepts.}
}
\value{
An object of the same class as \code{.data}.
-
- Data frame row names are silently dropped. To preserve, convert to an
- explicit variable.
}
\description{
-Mutate adds new variables and preserves existing; transmute drops existing
-variables.
+\code{mutate()} adds new variables and preserves existing;
+\code{transmute()} drops existing variables.
+}
+\section{Useful functions}{
+
+\itemize{
+\item \code{\link{+}}, \code{\link{-}} etc
+\item \code{\link[=log]{log()}}
+\item \code{\link[=lead]{lead()}}, \code{\link[=lag]{lag()}}
+\item \code{\link[=dense_rank]{dense_rank()}}, \code{\link[=min_rank]{min_rank()}}, \code{\link[=percent_rank]{percent_rank()}}, \code{\link[=row_number]{row_number()}},
+\code{\link[=cume_dist]{cume_dist()}}, \code{\link[=ntile]{ntile()}}
+\item \code{\link[=cumsum]{cumsum()}}, \code{\link[=cummean]{cummean()}}, \code{\link[=cummin]{cummin()}}, \code{\link[=cummax]{cummax()}}, \code{\link[=cumany]{cumany()}}, \code{\link[=cumall]{cumall()}}
+\item \code{\link[=na_if]{na_if()}}, \code{\link[=coalesce]{coalesce()}}
+\item \code{\link[=if_else]{if_else()}}, \code{\link[=recode]{recode()}}, \code{\link[=case_when]{case_when()}}
+}
+}
+
+\section{Scoped mutation and transmutation}{
+
+
+The three \link{scoped} variants of \code{mutate()} (\code{\link[=mutate_all]{mutate_all()}},
+\code{\link[=mutate_if]{mutate_if()}} and \code{\link[=mutate_at]{mutate_at()}}) and the three variants of
+\code{transmute()} (\code{\link[=transmute_all]{transmute_all()}}, \code{\link[=transmute_if]{transmute_if()}},
+\code{\link[=transmute_at]{transmute_at()}}) make it easy to apply a transformation to a
+selection of variables.
+}
+
+\section{Tidy data}{
+
+When applied to a data frame, row names are silently dropped. To preserve,
+convert to an explicit variable with \code{\link[tibble:rownames_to_column]{tibble::rownames_to_column()}}.
}
+
\examples{
-mutate(mtcars, displ_l = disp / 61.0237)
-transmute(mtcars, displ_l = disp / 61.0237)
+# Newly created variables are available immediately
+mtcars \%>\% as_tibble() \%>\% mutate(
+ cyl2 = cyl * 2,
+ cyl4 = cyl2 * 2
+)
+
+# You can also use mutate() to remove variables and
+# modify existing variables
+mtcars \%>\% as_tibble() \%>\% mutate(
+ mpg = NULL,
+ disp = disp * 0.0163871 # convert to litres
+)
+
+
+# window functions are useful for grouped mutates
+mtcars \%>\%
+ group_by(cyl) \%>\%
+ mutate(rank = min_rank(desc(mpg)))
+# see `vignette("window-functions")` for more details
-mutate(mtcars, cyl = NULL)
+# You can drop variables by setting them to NULL
+mtcars \%>\% mutate(cyl = NULL)
+
+# mutate() vs transmute --------------------------
+# mutate() keeps all existing variables
+mtcars \%>\%
+ mutate(displ_l = disp / 61.0237)
+
+# transmute keeps only the variables you create
+mtcars \%>\%
+ transmute(displ_l = disp / 61.0237)
+
+
+# mutate() supports quasiquotation. You can unquote quosures, which
+# can refer to both contextual variables and variable names:
+var <- 100
+as_tibble(mtcars) \%>\% mutate(cyl = !! quo(cyl * var))
}
\seealso{
-Other single.table.verbs: \code{\link{arrange}},
+Other single table verbs: \code{\link{arrange}},
\code{\link{filter}}, \code{\link{select}},
\code{\link{slice}}, \code{\link{summarise}}
}
-
diff --git a/man/n.Rd b/man/n.Rd
index f47b7c7..b98905e 100644
--- a/man/n.Rd
+++ b/man/n.Rd
@@ -7,9 +7,9 @@
n()
}
\description{
-This function is implemented special for each data source and can only
-be used from within \code{\link{summarise}}, \code{\link{mutate}} and
-\code{\link{filter}}
+This function is implemented specifically for each data source and can only
+be used from within \code{\link[=summarise]{summarise()}}, \code{\link[=mutate]{mutate()}} and
+\code{\link[=filter]{filter()}}.
}
\examples{
if (require("nycflights13")) {
@@ -19,4 +19,3 @@ mutate(carriers, n = n())
filter(carriers, n() < 100)
}
}
-
diff --git a/man/n_distinct.Rd b/man/n_distinct.Rd
index b4266f5..4ed7f4b 100644
--- a/man/n_distinct.Rd
+++ b/man/n_distinct.Rd
@@ -7,9 +7,9 @@
n_distinct(..., na.rm = FALSE)
}
\arguments{
-\item{na.rm}{id \code{TRUE} missing values don't count}
-
\item{\dots}{vectors of values}
+
+\item{na.rm}{if \code{TRUE} missing values don't count}
}
\description{
This is a faster and more concise equivalent of \code{length(unique(x))}
@@ -19,4 +19,3 @@ x <- sample(1:10, 1e5, rep = TRUE)
length(unique(x))
n_distinct(x)
}
-
diff --git a/man/na_if.Rd b/man/na_if.Rd
index ec05718..c7073f3 100644
--- a/man/na_if.Rd
+++ b/man/na_if.Rd
@@ -2,18 +2,18 @@
% Please edit documentation in R/na_if.R
\name{na_if}
\alias{na_if}
-\title{Convert values to NA.}
+\title{Convert values to NA}
\usage{
na_if(x, y)
}
\arguments{
\item{x}{Vector to modify}
-\item{y}{If th}
+\item{y}{Value to replace with NA}
}
\value{
A modified version of \code{x} that replaces any values that
- are equal to \code{y} with NA.
+are equal to \code{y} with NA.
}
\description{
This is a translation of the SQL command \code{NULL_IF}. It is useful
@@ -30,7 +30,6 @@ y <- c("abc", "def", "", "ghi")
na_if(y, "")
}
\seealso{
-\code{\link{coalesce}()} to replace missing values with a specified
- value.
+\code{\link[=coalesce]{coalesce()}} to replace missing values with a specified
+value.
}
-
diff --git a/man/named_commas.Rd b/man/named_commas.Rd
deleted file mode 100644
index dc6d028..0000000
--- a/man/named_commas.Rd
+++ /dev/null
@@ -1,16 +0,0 @@
-% Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/utils.r
-\name{named_commas}
-\alias{named_commas}
-\title{Provides comma-separated string out ot the parameters}
-\usage{
-named_commas(...)
-}
-\arguments{
-\item{...}{Arguments to be constructed into the string}
-}
-\description{
-Provides comma-separated string out ot the parameters
-}
-\keyword{internal}
-
diff --git a/man/nasa.Rd b/man/nasa.Rd
index 54ce3d3..1936768 100644
--- a/man/nasa.Rd
+++ b/man/nasa.Rd
@@ -4,7 +4,7 @@
\name{nasa}
\alias{nasa}
\title{NASA spatio-temporal data}
-\format{A \code{\link{tbl_cube}} with 41,472 observations.}
+\format{A \link{tbl_cube} with 41,472 observations.}
\usage{
nasa
}
@@ -22,8 +22,8 @@ Data Center (with permission; see important copyright terms below).
\itemize{
- \item \code{lat}, \code{long}: latitude and longitude
- \item \code{year}, \code{month}: month and year
+\item \code{lat}, \code{long}: latitude and longitude
+\item \code{year}, \code{month}: month and year
}
}
@@ -31,14 +31,14 @@ Data Center (with permission; see important copyright terms below).
\itemize{
- \item \code{cloudlow}, \code{cloudmed}, \code{cloudhigh}: cloud cover
- at three heights
- \item \code{ozone}
- \item \code{surftemp} and \code{temperature}
- \item \code{pressure}
+\item \code{cloudlow}, \code{cloudmed}, \code{cloudhigh}: cloud cover
+at three heights
+\item \code{ozone}
+\item \code{surftemp} and \code{temperature}
+\item \code{pressure}
}
}
+
\examples{
nasa
}
-
diff --git a/man/near.Rd b/man/near.Rd
index 96459d5..ac515f7 100644
--- a/man/near.Rd
+++ b/man/near.Rd
@@ -2,7 +2,7 @@
% Please edit documentation in R/near.R
\name{near}
\alias{near}
-\title{Compare two numeric vectors.}
+\title{Compare two numeric vectors}
\usage{
near(x, y, tol = .Machine$double.eps^0.5)
}
@@ -20,4 +20,3 @@ a built in tolerance
sqrt(2) ^ 2 == 2
near(sqrt(2) ^ 2, 2)
}
-
diff --git a/man/nth.Rd b/man/nth.Rd
index ac756fb..cb3c610 100644
--- a/man/nth.Rd
+++ b/man/nth.Rd
@@ -1,10 +1,10 @@
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/nth-value.R
\name{nth}
+\alias{nth}
\alias{first}
\alias{last}
-\alias{nth}
-\title{Extract the first, last or nth value from a vector.}
+\title{Extract the first, last or nth value from a vector}
\usage{
nth(x, n, order_by = NULL, default = default_missing(x))
@@ -15,19 +15,21 @@ last(x, order_by = NULL, default = default_missing(x))
\arguments{
\item{x}{A vector}
-\item{n}{For \code{nth_value}, a single integer specifying the position.
- Negative integers index from the end (i.e. \code{-1L} will return the
- last value in the vector).
+\item{n}{For \code{nth_value()}, a single integer specifying the position.
+Negative integers index from the end (i.e. \code{-1L} will return the
+last value in the vector).
- If a double is supplied, it will be silently truncated.}
+If a double is supplied, it will be silently truncated.}
\item{order_by}{An optional vector used to determine the order}
\item{default}{A default value to use if the position does not exist in
-the input. This is guessed by default for atomic vectors, where a
-missing value of the appropriate type is return, and for lists, where
-a \code{NULL} is return. For more complicated objects, you'll need to
-supply this value.}
+the input. This is guessed by default for base vectors, where a
+missing value of the appropriate type is returned, and for lists, where
+a \code{NULL} is returned.
+
+For more complicated objects, you'll need to supply this value.
+Make sure it is the same type as \code{x}.}
}
\value{
A single value. \code{[[} is used to do the subsetting.
@@ -42,12 +44,18 @@ than expected.
x <- 1:10
y <- 10:1
+first(x)
+last(y)
+
nth(x, 1)
nth(x, 5)
nth(x, -2)
nth(x, 11)
last(x)
+# Second argument provides optional ordering
last(x, y)
-}
+# These functions always return a single value
+first(integer())
+}
diff --git a/man/nycflights13.Rd b/man/nycflights13.Rd
deleted file mode 100644
index 6603add..0000000
--- a/man/nycflights13.Rd
+++ /dev/null
@@ -1,30 +0,0 @@
-% Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/data-nycflights13.r
-\name{nycflights13}
-\alias{copy_nycflights13}
-\alias{has_nycflights13}
-\alias{nycflights13}
-\alias{nycflights13_postgres}
-\alias{nycflights13_sqlite}
-\title{Database versions of the nycflights13 data}
-\usage{
-nycflights13_sqlite(path = NULL)
-
-nycflights13_postgres(dbname = "nycflights13", ...)
-
-has_nycflights13(type = c("sqlite", "postgresql"), ...)
-
-copy_nycflights13(src, ...)
-}
-\arguments{
-\item{path}{location of sqlite database file}
-
-\item{dbname, ...}{Arguments passed on to \code{\link{src_postgres}}}
-}
-\description{
-These functions cache the data from the \code{nycflights13} database in
-a local database, for use in examples and vignettes. Indexes are created
-to making joining tables on natural keys efficient.
-}
-\keyword{internal}
-
diff --git a/man/order_by.Rd b/man/order_by.Rd
index 508b602..0aea5d9 100644
--- a/man/order_by.Rd
+++ b/man/order_by.Rd
@@ -2,7 +2,7 @@
% Please edit documentation in R/order-by.R
\name{order_by}
\alias{order_by}
-\title{A helper function for ordering window function output.}
+\title{A helper function for ordering window function output}
\usage{
order_by(order_by, call)
}
@@ -13,13 +13,13 @@ order_by(order_by, call)
is the vector being operated on}
}
\description{
-This is a useful function to control the order of window functions in
-R that don't have a specific ordering parameter. When translated to SQL
+This function makes it possible to control the ordering of window functions
+in R that don't have a specific ordering parameter. When translated to SQL
it will modify the order clause of the OVER function.
}
\details{
This function works by changing the \code{call} to instead call
-\code{\link{with_order}} with the appropriate arguments.
+\code{\link[=with_order]{with_order()}} with the appropriate arguments.
}
\examples{
order_by(10:1, cumsum(1:10))
@@ -36,4 +36,3 @@ arrange(wrong, year)
right <- mutate(scrambled, running = order_by(year, cumsum(value)))
arrange(right, year)
}
-
diff --git a/man/partial_eval.Rd b/man/partial_eval.Rd
deleted file mode 100644
index ac6af6c..0000000
--- a/man/partial_eval.Rd
+++ /dev/null
@@ -1,64 +0,0 @@
-% Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/partial-eval.r
-\name{partial_eval}
-\alias{partial_eval}
-\title{Partially evaluate an expression.}
-\usage{
-partial_eval(call, tbl = NULL, env = parent.frame())
-}
-\arguments{
-\item{call}{an unevaluated expression, as produced by \code{\link{quote}}}
-
-\item{tbl}{a tbl object}
-
-\item{env}{environment in which to search for local values}
-}
-\description{
-This function partially evaluates an expression, using information from
-the tbl to determine whether names refer to local expressions
-or remote variables. This simplifies SQL translation because expressions
-don't need to carry around their environment - all revelant information
-is incorporated into the expression.
-}
-\section{Symbol substitution}{
-
-
-\code{partial_eval} needs to guess if you're referring to a variable on the
-server (remote), or in the current environment (local). It's not possible to
-do this 100% perfectly. \code{partial_eval} uses the following heuristic:
-
-\itemize{
- \item If the tbl variables are known, and the symbol matches a tbl
- variable, then remote.
- \item If the symbol is defined locally, local.
- \item Otherwise, remote.
-}
-}
-\examples{
-if (require("Lahman")) {
-bdf <- tbl_df(Batting)
-partial_eval(quote(year > 1980), bdf)
-
-ids <- c("ansonca01", "forceda01", "mathebo01")
-partial_eval(quote(id \%in\% ids), bdf)
-
-# You can use local to disambiguate between local and remote
-# variables: otherwise remote is always preferred
-year <- 1980
-partial_eval(quote(year > year), bdf)
-partial_eval(quote(year > local(year)), bdf)
-
-# Functions are always assumed to be remote. Use local to force evaluation
-# in R.
-f <- function(x) x + 1
-partial_eval(quote(year > f(1980)), bdf)
-partial_eval(quote(year > local(f(1980))), bdf)
-
-# For testing you can also use it with the tbl omitted
-partial_eval(quote(1 + 2 * 3))
-x <- 1
-partial_eval(quote(x ^ y))
-}
-}
-\keyword{internal}
-
diff --git a/man/progress_estimated.Rd b/man/progress_estimated.Rd
index 3e68310..ec8fd92 100644
--- a/man/progress_estimated.Rd
+++ b/man/progress_estimated.Rd
@@ -7,14 +7,14 @@
progress_estimated(n, min_time = 0)
}
\arguments{
-\item{n}{Total number of}
+\item{n}{Total number of items}
\item{min_time}{Progress bar will wait until at least \code{min_time}
seconds have elapsed before displaying any results.}
}
\value{
A ref class with methods \code{tick()}, \code{print()},
- \code{pause()}, and \code{stop()}.
+\code{pause()}, and \code{stop()}.
}
\description{
This reference class represents a text progress bar displayed estimated
@@ -46,4 +46,3 @@ for (i in 1:10) p$pause(0.5)$tick()$print()
}
}
\keyword{internal}
-
diff --git a/man/pull.Rd b/man/pull.Rd
new file mode 100644
index 0000000..905758f
--- /dev/null
+++ b/man/pull.Rd
@@ -0,0 +1,43 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/pull.R
+\name{pull}
+\alias{pull}
+\title{Pull out a single variable}
+\usage{
+pull(.data, var = -1)
+}
+\arguments{
+\item{.data}{A table of data}
+
+\item{var}{A variable specified as:
+\itemize{
+\item a literal variable name
+\item a positive integer, giving the position counting from the left
+\item a negative integer, giving the position counting from the right.
+}
+
+The default returns the last column (on the assumption that's the
+column you've created most recently).
+
+This argument is taken by expression and supports
+\link[rlang:quasiquotation]{quasiquotation} (you can unquote column
+names and column positions).}
+}
+\description{
+This works like \code{[[} for local data frames, and automatically collects
+before indexing for remote data tables.
+}
+\examples{
+mtcars \%>\% pull(-1)
+mtcars \%>\% pull(1)
+mtcars \%>\% pull(cyl)
+
+# Also works for remote sources
+if (requireNamespace("dbplyr", quietly = TRUE)) {
+df <- dbplyr::memdb_frame(x = 1:10, y = 10:1, .name = "pull-ex")
+df \%>\%
+ mutate(z = x * y) \%>\%
+ pull()
+}
+
+}
diff --git a/man/query.Rd b/man/query.Rd
deleted file mode 100644
index dea8547..0000000
--- a/man/query.Rd
+++ /dev/null
@@ -1,20 +0,0 @@
-% Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/query.r
-\name{query}
-\alias{query}
-\title{Create a mutable query object.}
-\usage{
-query(con, sql, .vars)
-}
-\arguments{
-\item{con}{a \code{DBOConnection}}
-
-\item{sql}{a string containing an sql query.}
-}
-\description{
-A query object is mutable wrapper around a \code{DBIResult} that caches
-expensive operations, and insulates the rest of dplyr from the vagaries of
-DBI and the individual database implementation.
-}
-\keyword{internal}
-
diff --git a/man/ranking.Rd b/man/ranking.Rd
index 7cd758b..e56a2ea 100644
--- a/man/ranking.Rd
+++ b/man/ranking.Rd
@@ -1,13 +1,13 @@
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/rank.R
\name{ranking}
-\alias{cume_dist}
-\alias{dense_rank}
-\alias{min_rank}
-\alias{ntile}
-\alias{percent_rank}
\alias{ranking}
\alias{row_number}
+\alias{ntile}
+\alias{min_rank}
+\alias{dense_rank}
+\alias{percent_rank}
+\alias{cume_dist}
\title{Windowed rank functions.}
\usage{
row_number(x)
@@ -30,30 +30,24 @@ with Inf or -Inf before ranking.}
\item{n}{number of groups to split up into.}
}
\description{
-Six variations on ranking functions, mimicing the ranking functions
+Six variations on ranking functions, mimicking the ranking functions
described in SQL2003. They are currently implemented using the built in
\code{rank} function, and are provided mainly as a convenience when
converting between R and SQL. All ranking functions map smallest inputs
-to smallest outputs. Use \code{\link{desc}} to reverse the direction..
+to smallest outputs. Use \code{\link[=desc]{desc()}} to reverse the direction.
}
\details{
\itemize{
-\item \code{row_number}: equivalent to \code{rank(ties.method = "first")}
-
-\item \code{min_rank}: equivalent to \code{rank(ties.method = "min")}
-
-\item \code{dense_rank}: like \code{min_rank}, but with no gaps between
- ranks
-
-\item \code{percent_rank}: a number between 0 and 1 computed by
- rescaling \code{min_rank} to [0, 1]
-
-\item \code{cume_dist}: a cumulative distribution function. Proportion
- of all values less than or equal to the current rank.
-
-\item \code{ntile}: a rough rank, which breaks the input vector into
- \code{n} buckets.
-
+\item \code{row_number()}: equivalent to \code{rank(ties.method = "first")}
+\item \code{min_rank()}: equivalent to \code{rank(ties.method = "min")}
+\item \code{dense_rank()}: like \code{min_rank()}, but with no gaps between
+ranks
+\item \code{percent_rank()}: a number between 0 and 1 computed by
+rescaling \code{min_rank} to \code{[0, 1]}
+\item \code{cume_dist()}: a cumulative distribution function. Proportion
+of all values less than or equal to the current rank.
+\item \code{ntile()}: a rough rank, which breaks the input vector into
+\code{n} buckets.
}
}
\examples{
@@ -66,5 +60,9 @@ cume_dist(x)
ntile(x, 2)
ntile(runif(100), 10)
-}
+# row_number can be used with single table verbs without specifying x
+# (for data frames and databases that support windowing)
+mutate(mtcars, row_number() == 1L)
+mtcars \%>\% filter(between(row_number(), 1, 10))
+}
diff --git a/man/recode.Rd b/man/recode.Rd
index fc1bd6f..b75218e 100644
--- a/man/recode.Rd
+++ b/man/recode.Rd
@@ -12,17 +12,22 @@ recode_factor(.x, ..., .default = NULL, .missing = NULL, .ordered = FALSE)
\arguments{
\item{.x}{A vector to modify}
-\item{...}{Replacments. These should be named for character and factor
- \code{.x}, and can be named for numeric \code{.x}.
+\item{...}{Replacements. These should be named for character and factor
+\code{.x}, and can be named for numeric \code{.x}. The argument names should be the
+current values to be replaced, and the argument values should be the new
+(replacement) values.
- All replacements must be the same type, and must have either
- length one or the same length as x.}
+All replacements must be the same type, and must have either
+length one or the same length as x.
+
+These dots are evaluated with \link[rlang:dots_list]{explicit splicing}.}
\item{.default}{If supplied, all values not otherwise matched will
be given this value. If not supplied and if the replacements are
the same type as the original values in \code{.x}, unmatched
values are not changed. If not supplied and if the replacements
are not compatible, unmatched values are replaced with \code{NA}.
+
\code{.default} must be either length 1 or the same length as
\code{.x}.}
@@ -35,15 +40,23 @@ ordered factor.}
}
\value{
A vector the same length as \code{.x}, and the same type as
- the first of \code{...}, \code{.default}, or \code{.missing}.
- \code{recode_factor()} returns a factor whose levels are in the
- same order as in \code{...}.
+the first of \code{...}, \code{.default}, or \code{.missing}.
+\code{recode_factor()} returns a factor whose levels are in the
+same order as in \code{...}.
}
\description{
-This is a vectorised version of \code{\link{switch}()}: you can replace
+This is a vectorised version of \code{\link[=switch]{switch()}}: you can replace
numeric values based on their position, and character values by their
name. This is an S3 generic: dplyr provides methods for numeric, character,
-and factors. For logical vectors, use \code{\link{if_else}}
+and factors. For logical vectors, use \code{\link[=if_else]{if_else()}}. For more complicated
+criteria, use \code{\link[=case_when]{case_when()}}.
+}
+\details{
+You can use \code{recode()} directly with factors; it will preserve the existing
+order of levels while changing the values. Alternatively, you can
+use \code{recode_factor()}, which will change the order of levels to match
+the order of replacements. See the \href{http://forcats.tidyverse.org/}{forcats}
+package for more tools for working with factors and their levels.
}
\examples{
# Recode values with named arguments
@@ -81,4 +94,3 @@ recode_factor(x, `1` = "z", `2` = "y", .default = "D", .missing = "M")
recode_factor(letters[1:3], b = "z", c = "y")
recode_factor(factor(letters[1:3]), b = "z", c = "y")
}
-
diff --git a/man/reexports.Rd b/man/reexports.Rd
index b26d689..36556c2 100644
--- a/man/reexports.Rd
+++ b/man/reexports.Rd
@@ -1,21 +1,48 @@
% Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/tibble-reexport.r, R/utils.r
+% Please edit documentation in R/reexport-rlang.R, R/reexport-tibble.r,
+% R/utils.r
\docType{import}
\name{reexports}
-\alias{\%>\%}
-\alias{add_row}
-\alias{as_data_frame}
+\alias{reexports}
+\alias{quo}
+\alias{reexports}
+\alias{quos}
+\alias{reexports}
+\alias{enquo}
+\alias{reexports}
+\alias{quo_name}
+\alias{reexports}
\alias{data_frame}
+\alias{reexports}
\alias{data_frame_}
-\alias{frame_data}
-\alias{glimpse}
+\alias{reexports}
+\alias{as_data_frame}
+\alias{reexports}
\alias{lst}
+\alias{reexports}
\alias{lst_}
\alias{reexports}
+\alias{add_row}
+\alias{reexports}
+\alias{type_sum}
+\alias{reexports}
+\alias{glimpse}
+\alias{reexports}
+\alias{frame_data}
+\alias{reexports}
+\alias{tribble}
+\alias{reexports}
\alias{tibble}
+\alias{reexports}
+\alias{as_tibble}
+\alias{reexports}
\alias{trunc_mat}
-\alias{type_sum}
+\alias{reexports}
+\alias{tbl_sum}
+\alias{reexports}
+\alias{\%>\%}
\title{Objects exported from other packages}
+\keyword{internal}
\description{
These objects are imported from other packages. Follow the links
below to see their documentation.
@@ -23,7 +50,8 @@ below to see their documentation.
\describe{
\item{magrittr}{\code{\link[magrittr]{\%>\%}}}
- \item{tibble}{\code{\link[tibble]{data_frame}}, \code{\link[tibble]{data_frame_}}, \code{\link[tibble]{as_data_frame}}, \code{\link[tibble]{lst}}, \code{\link[tibble]{lst_}}, \code{\link[tibble]{add_row}}, \code{\link[tibble]{type_sum}}, \code{\link[tibble]{glimpse}}, \code{\link[tibble]{frame_data}}, \code{\link[tibble]{tibble}}, \code{\link[tibble]{trunc_mat}}}
+ \item{rlang}{\code{\link[rlang]{quo}}, \code{\link[rlang]{quos}}, \code{\link[rlang]{enquo}}, \code{\link[rlang]{quo_name}}}
+
+ \item{tibble}{\code{\link[tibble]{data_frame}}, \code{\link[tibble]{data_frame_}}, \code{\link[tibble]{as_data_frame}}, \code{\link[tibble]{lst}}, \code{\link[tibble]{lst_}}, \code{\link[tibble]{add_row}}, \code{\link[tibble]{type_sum}}, \code{\link[tibble]{glimpse}}, \code{\link[tibble]{frame_data}}, \code{\link[tibble]{tribble}}, \code{\link[tibble]{tibble}}, \code{\link[tibble]{as_tibble}}, \code{\link[tibble]{trunc_mat}}, \code{\link[tibble]{tbl_sum}}}
}}
-\keyword{internal}
diff --git a/man/rowwise.Rd b/man/rowwise.Rd
index 084799e..32bfe03 100644
--- a/man/rowwise.Rd
+++ b/man/rowwise.Rd
@@ -10,20 +10,19 @@ rowwise(data)
\item{data}{Input data frame.}
}
\description{
-\code{rowwise} is used for the results of \code{\link{do}} when you
+\code{rowwise()} is used for the results of \code{\link[=do]{do()}} when you
create list-variables. It is also useful to support arbitrary
complex operations that need to be applied to each row.
}
\details{
-Currently \code{rowwise} grouping only works with data frames. Its
+Currently, rowwise grouping only works with data frames. Its
main impact is to allow you to work with list-variables in
-\code{\link{summarise}} and \code{\link{mutate}} without having to
+\code{\link[=summarise]{summarise()}} and \code{\link[=mutate]{mutate()}} without having to
use \code{[[1]]}. This makes \code{summarise()} on a rowwise tbl
-effectively equivalent to plyr's \code{ldply}.
+effectively equivalent to \code{\link[plyr:ldply]{plyr::ldply()}}.
}
\examples{
df <- expand.grid(x = 1:3, y = 3:1)
df \%>\% rowwise() \%>\% do(i = seq(.$x, .$y))
.Last.value \%>\% summarise(n = length(i))
}
-
diff --git a/man/same_src.Rd b/man/same_src.Rd
index a40a703..e988ee8 100644
--- a/man/same_src.Rd
+++ b/man/same_src.Rd
@@ -16,4 +16,3 @@ a logical flag
Figure out if two sources are the same (or two tbl have the same source)
}
\keyword{internal}
-
diff --git a/man/sample.Rd b/man/sample.Rd
index 5784833..5e700bc 100644
--- a/man/sample.Rd
+++ b/man/sample.Rd
@@ -2,34 +2,38 @@
% Please edit documentation in R/sample.R
\name{sample}
\alias{sample}
-\alias{sample_frac}
\alias{sample_n}
-\title{Sample n rows from a table.}
+\alias{sample_frac}
+\title{Sample n rows from a table}
\usage{
-sample_n(tbl, size, replace = FALSE, weight = NULL, .env = parent.frame())
+sample_n(tbl, size, replace = FALSE, weight = NULL, .env = NULL)
-sample_frac(tbl, size = 1, replace = FALSE, weight = NULL,
- .env = parent.frame())
+sample_frac(tbl, size = 1, replace = FALSE, weight = NULL, .env = NULL)
}
\arguments{
\item{tbl}{tbl of data.}
-\item{size}{For \code{sample_n}, the number of rows to select.
-For \code{sample_frac}, the fraction of rows to select.
+\item{size}{For \code{sample_n()}, the number of rows to select.
+For \code{sample_frac()}, the fraction of rows to select.
If \code{tbl} is grouped, \code{size} applies to each group.}
\item{replace}{Sample with or without replacement?}
-\item{weight}{Sampling weights. This expression is evaluated in the
-context of the data frame. It must return a vector of non-negative
-numbers the same length as the input. Weights are automatically
-standardised to sum to 1.}
+\item{weight}{Sampling weights. This must evaluate to a vector of
+non-negative numbers the same length as the input. Weights are
+automatically standardised to sum to 1.
-\item{.env}{Environment in which to look for non-data names used in
-\code{weight}. Non-default settings for experts only.}
+This argument is automatically \link[rlang:quo]{quoted} and later
+\link[rlang:eval_tidy]{evaluated} in the context of the data
+frame. It supports \link[rlang:quasiquotation]{unquoting}. See
+\code{vignette("programming")} for an introduction to these concepts.}
+
+\item{.env}{This variable is deprecated and no longer has any
+effect. To evaluate \code{weight} in a particular context, you can
+now unquote a \link[rlang:quosure]{quosure}.}
}
\description{
-This is a wrapper around \code{\link{sample.int}} to make it easy to
+This is a wrapper around \code{\link[=sample.int]{sample.int()}} to make it easy to
select random rows from a table. It currently only works for local
tbls.
}
@@ -56,4 +60,3 @@ sample_frac(mtcars, 0.1, weight = 1 / mpg)
sample_frac(by_cyl, 0.2)
sample_frac(by_cyl, 1, replace = TRUE)
}
-
diff --git a/man/scoped.Rd b/man/scoped.Rd
new file mode 100644
index 0000000..1dde4a8
--- /dev/null
+++ b/man/scoped.Rd
@@ -0,0 +1,63 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/colwise.R
+\name{scoped}
+\alias{scoped}
+\title{Operate on a selection of variables}
+\arguments{
+\item{.tbl}{A \code{tbl} object.}
+
+\item{.funs}{List of function calls generated by \code{\link[=funs]{funs()}}, or a
+character vector of function names, or simply a function.
+
+Bare formulas are passed to \code{\link[rlang:as_function]{rlang::as_function()}} to create
+purrr-style lambda functions. Note that these lambda prevent
+hybrid evaluation from happening and it is thus more efficient to
+supply functions like \code{mean()} directly rather than in a
+lambda-formula.}
+
+\item{.vars}{A list of columns generated by \code{\link[=vars]{vars()}},
+or a character vector of column names, or a numeric vector of column
+positions.}
+
+\item{.predicate}{A predicate function to be applied to the columns
+or a logical vector. The variables for which \code{.predicate} is or
+returns \code{TRUE} are selected. This argument is passed to
+\code{\link[rlang:as_function]{rlang::as_function()}} and thus supports quosure-style lambda
+functions and strings representing function names.}
+
+\item{...}{Additional arguments for the function calls in
+\code{.funs}. These are evaluated only once, with \link[rlang:dots_list]{explicit
+splicing}.}
+}
+\description{
+The variants suffixed with \code{_if}, \code{_at} or \code{_all} apply an
+expression (sometimes several) to all variables within a specified
+subset. This subset can contain all variables (\code{_all} variants), a
+\code{\link[=vars]{vars()}} selection (\code{_at} variants), or variables selected with a
+predicate (\code{_if} variants).
+}
+\details{
+The verbs with scoped variants are:
+\itemize{
+\item \code{\link[=mutate]{mutate()}}, \code{\link[=transmute]{transmute()}} and \code{\link[=summarise]{summarise()}}. See \code{\link[=summarise_all]{summarise_all()}}.
+\item \code{\link[=filter]{filter()}}. See \code{\link[=filter_all]{filter_all()}}.
+\item \code{\link[=group_by]{group_by()}}. See \code{\link[=group_by_all]{group_by_all()}}.
+\item \code{\link[=rename]{rename()}} and \code{\link[=select]{select()}}. See \code{\link[=select_all]{select_all()}}.
+\item \code{\link[=arrange]{arrange()}}. See \code{\link[=arrange_all]{arrange_all()}}
+}
+
+There are three kinds of scoped variants. They differ in the scope
+of the variable selection on which operations are applied:
+\itemize{
+\item Verbs suffixed with \code{_all()} apply an operation on all variables.
+\item Verbs suffixed with \code{_at()} apply an operation on a subset of
+variables specified with the quoting function \code{\link[=vars]{vars()}}. This
+quoting function accepts \code{\link[=select_vars]{select_vars()}} helpers like
+\code{\link[=starts_with]{starts_with()}}. Instead of a \code{\link[=vars]{vars()}} selection, you can also
+supply an \link[rlang:is_integerish]{integerish} vector of column
+positions or a character vector of column names.
+\item Verbs suffixed with \code{_if()} apply an operation on the subset of
+variables for which a predicate function returns \code{TRUE}. Instead
+of a predicate function, you can also supply a logical vector.
+}
+}
diff --git a/man/se-deprecated.Rd b/man/se-deprecated.Rd
new file mode 100644
index 0000000..83d9441
--- /dev/null
+++ b/man/se-deprecated.Rd
@@ -0,0 +1,121 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/count-tally.R, R/distinct.R, R/do.r, R/funs.R,
+% R/group-by.r, R/group-indices.R, R/manip.r, R/select-vars.R
+\name{tally_}
+\alias{tally_}
+\alias{count_}
+\alias{add_tally_}
+\alias{add_count_}
+\alias{distinct_}
+\alias{do_}
+\alias{funs_}
+\alias{group_by_}
+\alias{group_indices_}
+\alias{filter_}
+\alias{slice_}
+\alias{summarise_}
+\alias{summarize_}
+\alias{mutate_}
+\alias{transmute_}
+\alias{arrange_}
+\alias{select_}
+\alias{rename_}
+\alias{se-deprecated}
+\alias{select_vars_}
+\alias{rename_vars_}
+\title{Deprecated SE versions of main verbs.}
+\usage{
+tally_(x, wt, sort = FALSE)
+
+count_(x, vars, wt = NULL, sort = FALSE)
+
+add_tally_(x, wt, sort = FALSE)
+
+add_count_(x, vars, wt = NULL, sort = FALSE)
+
+distinct_(.data, ..., .dots, .keep_all = FALSE)
+
+do_(.data, ..., .dots = list())
+
+funs_(dots, args = list(), env = base_env())
+
+group_by_(.data, ..., .dots = list(), add = FALSE)
+
+group_indices_(.data, ..., .dots = list())
+
+filter_(.data, ..., .dots = list())
+
+slice_(.data, ..., .dots = list())
+
+summarise_(.data, ..., .dots = list())
+
+summarize_(.data, ..., .dots = list())
+
+mutate_(.data, ..., .dots = list())
+
+transmute_(.data, ..., .dots = list())
+
+arrange_(.data, ..., .dots = list())
+
+select_(.data, ..., .dots = list())
+
+rename_(.data, ..., .dots = list())
+
+select_vars_(vars, args, include = character(), exclude = character())
+
+rename_vars_(vars, args)
+}
+\arguments{
+\item{x}{a \code{\link[=tbl]{tbl()}} to tally/count.}
+
+\item{wt}{(Optional) If omitted, will count the number of rows. If
+specified, will perform a "weighted" tally by summing the
+(non-missing) values of variable \code{wt}. This argument is
+automatically \link[rlang:quo]{quoted} and later
+\link[rlang:eval_tidy]{evaluated} in the context of the data
+frame. It supports \link[rlang:quasiquotation]{unquoting}. See
+\code{vignette("programming")} for an introduction to these concepts.}
+
+\item{sort}{if \code{TRUE} will sort output in descending order of \code{n}}
+
+\item{vars}{Various meanings depending on the verb.}
+
+\item{.data}{A data frame.}
+
+\item{.keep_all}{If \code{TRUE}, keep all variables in \code{.data}.
+If a combination of \code{...} is not distinct, this keeps the
+first row of values.}
+
+\item{dots, .dots, ...}{Pair/values of expressions coercible to lazy objects.}
+
+\item{args}{Various meanings depending on the verb.}
+
+\item{env}{The environment in which functions should be evaluated.}
+
+\item{add}{When \code{add = FALSE}, the default, \code{group_by()} will
+override existing groups. To add to the existing groups, use
+\code{add = TRUE}.}
+
+\item{include}{Character vector of column names to always
+include/exclude.}
+
+\item{exclude}{Character vector of column names to always
+include/exclude.}
+}
+\description{
+dplyr used to offer twin versions of each verb suffixed with an
+underscore. These versions had standard evaluation (SE) semantics:
+rather than taking arguments by code, like NSE verbs, they took
+arguments by value. Their purpose was to make it possible to
+program with dplyr. However, dplyr now uses tidy evaluation
+semantics. NSE verbs still capture their arguments, but you can now
+unquote parts of these arguments. This offers full programmability
+with NSE verbs. Thus, the underscored versions are now superfluous.
+}
+\details{
+Unquoting triggers immediate evaluation of its operand and inlines
+the result within the captured expression. This result can be a
+value or an expression to be evaluated later with the rest of the
+argument. See \code{vignette("programming")} for more information.
+}
+\keyword{internal}
diff --git a/man/select.Rd b/man/select.Rd
index e96f615..d69e318 100644
--- a/man/select.Rd
+++ b/man/select.Rd
@@ -1,94 +1,108 @@
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/manip.r
\name{select}
-\alias{rename}
-\alias{rename_}
\alias{select}
-\alias{select_}
-\title{Select/rename variables by name.}
+\alias{rename}
+\title{Select/rename variables by name}
\usage{
select(.data, ...)
-select_(.data, ..., .dots)
-
rename(.data, ...)
-
-rename_(.data, ..., .dots)
}
\arguments{
\item{.data}{A tbl. All main verbs are S3 generics and provide methods
-for \code{\link{tbl_df}}, \code{\link[dtplyr]{tbl_dt}} and \code{\link{tbl_sql}}.}
+for \code{\link[=tbl_df]{tbl_df()}}, \code{\link[dtplyr:tbl_dt]{dtplyr::tbl_dt()}} and \code{\link[dbplyr:tbl_dbi]{dbplyr::tbl_dbi()}}.}
-\item{...}{Comma separated list of unquoted expressions. You can treat
-variable names like they are positions. Use positive values to select
-variables; use negative values to drop variables.}
+\item{...}{One or more unquoted expressions separated by commas.
+You can treat variable names like they are positions.
-\item{.dots}{Use \code{select_()} to do standard evaluation. See
-\code{vignette("nse")} for details}
+Positive values select variables; negative values drop variables.
+If the first expression is negative, \code{select()} will automatically
+start with all variables.
+
+Use named arguments to rename selected variables.
+
+These arguments are automatically \link[rlang:quo]{quoted} and
+\link[rlang:eval_tidy]{evaluated} in a context where column names
+represent column positions. They support
+\link[rlang:quasiquotation]{unquoting} and splicing. See
+\code{vignette("programming")} for an introduction to these concepts.}
}
\value{
An object of the same class as \code{.data}.
-
- Data frame row names are silently dropped. To preserve, convert to an
- explicit variable.
}
\description{
\code{select()} keeps only the variables you mention; \code{rename()}
keeps all variables.
}
-\section{Special functions}{
+\section{Useful functions}{
-As well as using existing functions like \code{:} and \code{c}, there are
+As well as using existing functions like \code{:} and \code{c()}, there are
a number of special functions that only work inside \code{select}
+\itemize{
+\item \code{\link[=starts_with]{starts_with()}}, \code{\link[=ends_with]{ends_with()}}, \code{\link[=contains]{contains()}}
+\item \code{\link[=matches]{matches()}}
+\item \code{\link[=num_range]{num_range()}}
+}
+
+To drop variables, use \code{-}.
+
+Note that except for \code{:}, \code{-} and \code{c()}, all complex expressions
+are evaluated outside the data frame context. This is to prevent
+accidental matching of data frame variables when you refer to
+variables from the calling context.
+}
+
+\section{Scoped selection and renaming}{
+
+The three \link{scoped} variants of \code{select()} (\code{\link[=select_all]{select_all()}},
+\code{\link[=select_if]{select_if()}} and \code{\link[=select_at]{select_at()}}) and the three variants of
+\code{rename()} (\code{\link[=rename_all]{rename_all()}}, \code{\link[=rename_if]{rename_if()}}, \code{\link[=rename_at]{rename_at()}}) make it
+easy to apply a renaming function to a selection of variables.
+}
+
+\section{Tidy data}{
-To drop variables, use \code{-}. You can rename variables with
-named arguments.
+When applied to a data frame, row names are silently dropped. To preserve,
+convert to an explicit variable with \code{\link[tibble:rownames_to_column]{tibble::rownames_to_column()}}.
}
+
\examples{
-iris <- tbl_df(iris) # so it prints a little nicer
+iris <- as_tibble(iris) # so it prints a little nicer
select(iris, starts_with("Petal"))
select(iris, ends_with("Width"))
-select(iris, contains("etal"))
-select(iris, matches(".t."))
-select(iris, Petal.Length, Petal.Width)
-vars <- c("Petal.Length", "Petal.Width")
-select(iris, one_of(vars))
+
+# Move Species variable to the front
+select(iris, Species, everything())
df <- as.data.frame(matrix(runif(100), nrow = 10))
df <- tbl_df(df[c(3, 4, 7, 1, 9, 8, 5, 2, 6, 10)])
select(df, V4:V6)
select(df, num_range("V", 4:6))
-# Drop variables
+# Drop variables with -
select(iris, -starts_with("Petal"))
-select(iris, -ends_with("Width"))
-select(iris, -contains("etal"))
-select(iris, -matches(".t."))
-select(iris, -Petal.Length, -Petal.Width)
-# Rename variables:
+
+# The .data pronoun is available:
+select(mtcars, .data$cyl)
+select(mtcars, .data$mpg : .data$disp)
+
+# However it isn't available within calls since those are evaluated
+# outside of the data context. This would fail if run:
+# select(mtcars, identical(.data$cyl))
+
+
+# Renaming -----------------------------------------
# * select() keeps only the variables you specify
select(iris, petal_length = Petal.Length)
-# Renaming multiple variables uses a prefix:
-select(iris, petal = starts_with("Petal"))
-
-# Reorder variables: keep the variable "Species" in the front
-select(iris, Species, everything())
# * rename() keeps all variables
rename(iris, petal_length = Petal.Length)
-
-# Programming with select ---------------------------------------------------
-select_(iris, ~Petal.Length)
-select_(iris, "Petal.Length")
-select_(iris, lazyeval::interp(~matches(x), x = ".t."))
-select_(iris, quote(-Petal.Length), quote(-Petal.Width))
-select_(iris, .dots = list(quote(-Petal.Length), quote(-Petal.Width)))
}
\seealso{
-Other single.table.verbs: \code{\link{arrange}},
+Other single table verbs: \code{\link{arrange}},
\code{\link{filter}}, \code{\link{mutate}},
\code{\link{slice}}, \code{\link{summarise}}
}
-
diff --git a/man/select_all.Rd b/man/select_all.Rd
new file mode 100644
index 0000000..4acfbcb
--- /dev/null
+++ b/man/select_all.Rd
@@ -0,0 +1,70 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/colwise-select.R
+\name{select_all}
+\alias{select_all}
+\alias{rename_all}
+\alias{select_if}
+\alias{rename_if}
+\alias{select_at}
+\alias{rename_at}
+\title{Select and rename a selection of variables}
+\usage{
+select_all(.tbl, .funs = list(), ...)
+
+rename_all(.tbl, .funs = list(), ...)
+
+select_if(.tbl, .predicate, .funs = list(), ...)
+
+rename_if(.tbl, .predicate, .funs = list(), ...)
+
+select_at(.tbl, .vars, .funs = list(), ...)
+
+rename_at(.tbl, .vars, .funs = list(), ...)
+}
+\arguments{
+\item{.tbl}{A \code{tbl} object.}
+
+\item{.funs}{A single expression quoted with \code{\link[=funs]{funs()}} or within a
+quosure, a string naming a function, or a function.}
+
+\item{...}{Additional arguments for the function calls in
+\code{.funs}. These are evaluated only once, with \link[rlang:dots_list]{explicit
+splicing}.}
+
+\item{.predicate}{A predicate function to be applied to the columns
+or a logical vector. The variables for which \code{.predicate} is or
+returns \code{TRUE} are selected. This argument is passed to
+\code{\link[rlang:as_function]{rlang::as_function()}} and thus supports quosure-style lambda
+functions and strings representing function names.}
+
+\item{.vars}{A list of columns generated by \code{\link[=vars]{vars()}},
+or a character vector of column names, or a numeric vector of column
+positions.}
+}
+\description{
+These \link{scoped} variants of \code{\link[=select]{select()}} and \code{\link[=rename]{rename()}} operate on a
+selection of variables. The semantics of these verbs have simple
+but important differences:
+\itemize{
+\item Selection drops variables that are not in the selection while
+renaming retains them.
+\item The renaming function is optional for selection but not for
+renaming.
+}
+}
+\examples{
+# Supply a renaming function:
+select_all(mtcars, toupper)
+select_all(mtcars, "toupper")
+select_all(mtcars, funs(toupper(.)))
+
+# Selection drops unselected variables:
+is_whole <- function(x) all(floor(x) == x)
+select_if(mtcars, is_whole, toupper)
+
+# But renaming retains them:
+rename_if(mtcars, is_whole, toupper)
+
+# The renaming function is optional for selection:
+select_if(mtcars, is_whole)
+}
diff --git a/man/select_helpers.Rd b/man/select_helpers.Rd
index 28b29b2..2bbb431 100644
--- a/man/select_helpers.Rd
+++ b/man/select_helpers.Rd
@@ -1,15 +1,15 @@
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/select-utils.R
\name{select_helpers}
-\alias{contains}
+\alias{select_helpers}
\alias{current_vars}
+\alias{starts_with}
\alias{ends_with}
-\alias{everything}
+\alias{contains}
\alias{matches}
\alias{num_range}
\alias{one_of}
-\alias{select_helpers}
-\alias{starts_with}
+\alias{everything}
\title{Select helpers}
\usage{
current_vars()
@@ -35,7 +35,7 @@ everything(vars = current_vars())
names.}
\item{vars}{A character vector of variable names. When called from inside
-\code{\link{select}()} these are automatically set to the names of the
+\code{\link[=select]{select()}} these are automatically set to the names of the
table.}
\item{prefix}{A prefix that starts the numeric range.}
@@ -48,18 +48,18 @@ a range of 2 gives "01", a range of three "001", etc.}
\item{...}{One or more character vectors.}
}
\value{
-An integer vector given the position of the matched variables.
+An integer vector giving the position of the matched variables.
}
\description{
These functions allow you to select variables based on their names.
\itemize{
- \item \code{starts_with()}: starts with a prefix
- \item \code{ends_with()}: ends with a prefix
- \item \code{contains()}: contains a literal string
- \item \code{matches()}: matches a regular expression
- \item \code{num_range()}: a numerical range like x01, x02, x03.
- \item \code{one_of()}: variables in character vector.
- \item \code{everything()}: all variables.
+\item \code{starts_with()}: starts with a prefix
+\item \code{ends_with()}: ends with a prefix
+\item \code{contains()}: contains a literal string
+\item \code{matches()}: matches a regular expression
+\item \code{num_range()}: a numerical range like x01, x02, x03.
+\item \code{one_of()}: variables in character vector.
+\item \code{everything()}: all variables.
}
}
\examples{
@@ -73,4 +73,3 @@ select(iris, everything())
vars <- c("Petal.Length", "Petal.Width")
select(iris, one_of(vars))
}
-
diff --git a/man/select_if.Rd b/man/select_if.Rd
deleted file mode 100644
index 6f5a373..0000000
--- a/man/select_if.Rd
+++ /dev/null
@@ -1,32 +0,0 @@
-% Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/manip.r
-\name{select_if}
-\alias{select_if}
-\title{Select columns using a predicate}
-\usage{
-select_if(.data, .predicate, ...)
-}
-\arguments{
-\item{.data}{A local tbl source.}
-
-\item{.predicate}{A predicate function to be applied to the columns
-or a logical vector. The columns for which \code{.predicate} is
-or returns \code{TRUE} will be summarised or mutated.}
-
-\item{...}{Additional arguments passed to \code{.predicate}.}
-}
-\description{
-This verb is analogous to \code{\link{summarise_if}()} and
-\code{\link{mutate_if}()} in that it lets you use a predicate on
-the columns of a data frame. Only those columns for which the
-predicate returns \code{TRUE} will be selected.
-}
-\details{
-Predicates can only be used with local sources like a data frame.
-}
-\examples{
-iris \%>\% select_if(is.factor)
-iris \%>\% select_if(is.numeric)
-iris \%>\% select_if(function(col) is.numeric(col) && mean(col) > 3.5)
-}
-
diff --git a/man/select_var.Rd b/man/select_var.Rd
new file mode 100644
index 0000000..da002e9
--- /dev/null
+++ b/man/select_var.Rd
@@ -0,0 +1,49 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/select-var.R
+\name{select_var}
+\alias{select_var}
+\title{Select variable}
+\usage{
+select_var(vars, var = -1)
+}
+\arguments{
+\item{vars}{A character vector of existing column names.}
+
+\item{var}{A variable specified as:
+\itemize{
+\item a literal variable name
+\item a positive integer, giving the position counting from the left
+\item a negative integer, giving the position counting from the right.
+}
+
+The default returns the last column (on the assumption that's the
+column you've created most recently).
+
+This argument is taken by expression and supports
+\link[rlang:quasiquotation]{quasiquotation} (you can unquote column
+names and column positions).}
+}
+\value{
+The selected column name as an unnamed string.
+}
+\description{
+This function powers \code{\link[=pull]{pull()}} and various functions of the tidyr
+package. It is similar to \code{\link[=select_vars]{select_vars()}} but returns only one
+column name and has slightly different semantics: it allows
+negative numbers to select columns from the end.
+}
+\examples{
+# It takes its argument by expression:
+select_var(letters, c)
+
+# Negative numbers select from the end:
+select_var(letters, -3)
+
+# You can unquote variables:
+var <- 10
+select_var(letters, !! var)
+}
+\seealso{
+\code{\link[=pull]{pull()}}, \code{\link[=select_vars]{select_vars()}}
+}
+\keyword{internal}
diff --git a/man/select_vars.Rd b/man/select_vars.Rd
index 321322f..2b97258 100644
--- a/man/select_vars.Rd
+++ b/man/select_vars.Rd
@@ -1,35 +1,51 @@
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/select-vars.R
\name{select_vars}
-\alias{rename_vars}
-\alias{rename_vars_}
\alias{select_vars}
-\alias{select_vars_}
+\alias{rename_vars}
\title{Select variables.}
\usage{
select_vars(vars, ..., include = character(), exclude = character())
-select_vars_(vars, args, include = character(), exclude = character())
-
-rename_vars(vars, ...)
-
-rename_vars_(vars, args)
+rename_vars(vars, ..., strict = TRUE)
}
\arguments{
\item{vars}{A character vector of existing column names.}
-\item{..., args}{Expressions to compute. \code{select_vars} and
-\code{rename_vars}}
+\item{..., args}{Expressions to compute
+
+These arguments are automatically \link[rlang:quo]{quoted} and
+\link[rlang:eval_tidy]{evaluated} in a context where elements of
+\code{vars} are objects representing their positions within
+\code{vars}. They support \link[rlang:quasiquotation]{unquoting} and
+splicing. See \code{vignette("programming")} for an introduction to
+these concepts.
+
+Note that except for \code{:}, \code{-} and \code{c()}, all complex expressions
+are evaluated outside that context. This is to prevent accidental
+matching to \code{vars} elements when you refer to variables from the
+calling context.}
\item{include, exclude}{Character vector of column names to always
include/exclude.}
+
+\item{strict}{If \code{TRUE}, will throw an error if you attempt to rename a
+variable that doesn't exist.}
}
\value{
A named character vector. Values are existing column names,
- names are new names.
+names are new names.
}
\description{
-These functions power \code{\link{select}()} and \code{\link{rename}()}.
+These functions power \code{\link[=select]{select()}} and \code{\link[=rename]{rename()}}.
+}
+\details{
+For historic reasons, the \code{vars} and \code{include} arguments are not
+prefixed with \code{.}. This means that any argument starting with \code{v}
+might partial-match on \code{vars} if it is not explicitly named. Also
+\code{...} cannot accept arguments named \code{exclude} or \code{include}. You can
+enquose and splice the dots to work around these limitations (see
+examples).
}
\examples{
# Keep variables
@@ -59,11 +75,33 @@ select_vars(names(iris), petal = starts_with("Petal"))
# Rename variables preserving all existing
rename_vars(names(iris), petal_length = Petal.Length)
-# Standard evaluation -------------------------------------------------------
-# You can use names, calls, formulas (or lists of), or a character vector
-select_vars_(names(iris), list(~Petal.Length))
-select_vars_(names(iris), list(quote(Petal.Length)))
-select_vars_(names(iris), "Petal.Length")
+# You can unquote names or formulas (or lists of)
+select_vars(names(iris), !!! list(quo(Petal.Length)))
+select_vars(names(iris), !! quote(Petal.Length))
+
+# The .data pronoun is available:
+select_vars(names(mtcars), .data$cyl)
+select_vars(names(mtcars), .data$mpg : .data$disp)
+
+# However it isn't available within calls since those are evaluated
+# outside of the data context. This would fail if run:
+# select_vars(names(mtcars), identical(.data$cyl))
+
+
+# If you're writing a wrapper around select_vars(), pass the dots
+# via splicing to avoid matching dotted arguments to select_vars()
+# named arguments (`vars`, `include` and `exclude`):
+wrapper <- function(...) {
+ select_vars(names(mtcars), !!! quos(...))
}
-\keyword{internal}
+# This won't partial-match on `vars`:
+wrapper(var = cyl)
+
+# This won't match on `include`:
+wrapper(include = cyl)
+}
+\seealso{
+\code{\link[=select_var]{select_var()}}
+}
+\keyword{internal}
diff --git a/man/setops.Rd b/man/setops.Rd
index aa60894..bd99f24 100644
--- a/man/setops.Rd
+++ b/man/setops.Rd
@@ -1,13 +1,13 @@
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/sets.r
\name{setops}
-\alias{intersect}
-\alias{setdiff}
-\alias{setequal}
\alias{setops}
+\alias{intersect}
\alias{union}
\alias{union_all}
-\title{Set operations.}
+\alias{setdiff}
+\alias{setequal}
+\title{Set operations}
\usage{
intersect(x, y, ...)
@@ -42,4 +42,3 @@ setdiff(second, first)
union_all(first, second)
setequal(mtcars, mtcars[32:1, ])
}
-
diff --git a/man/slice.Rd b/man/slice.Rd
index da1f9c2..82db8ea 100644
--- a/man/slice.Rd
+++ b/man/slice.Rd
@@ -2,27 +2,32 @@
% Please edit documentation in R/manip.r
\name{slice}
\alias{slice}
-\alias{slice_}
-\title{Select rows by position.}
+\title{Select rows by position}
\usage{
slice(.data, ...)
-
-slice_(.data, ..., .dots)
}
\arguments{
-\item{.data}{A tbl. All main verbs are S3 generics and provide methods
-for \code{\link{tbl_df}}, \code{\link[dtplyr]{tbl_dt}} and \code{\link{tbl_sql}}.}
+\item{.data}{A tbl.}
-\item{...}{Integer row values}
+\item{...}{Integer row values.
-\item{.dots}{Used to work around non-standard evaluation. See
-\code{vignette("nse")} for details.}
+These arguments are automatically \link[rlang:quo]{quoted} and
+\link[rlang:eval_tidy]{evaluated} in the context of the data
+frame. They support \link[rlang:quasiquotation]{unquoting} and
+splicing. See \code{vignette("programming")} for an introduction to
+these concepts.}
}
\description{
Slice does not work with relational databases because they have no
intrinsic notion of row order. If you want to perform the equivalent
-operation, use \code{\link{filter}()} and \code{\link{row_number}()}.
+operation, use \code{\link[=filter]{filter()}} and \code{\link[=row_number]{row_number()}}.
+}
+\section{Tidy data}{
+
+When applied to a data frame, row names are silently dropped. To preserve,
+convert to an explicit variable with \code{\link[tibble:rownames_to_column]{tibble::rownames_to_column()}}.
}
+
\examples{
slice(mtcars, 1L)
slice(mtcars, n())
@@ -39,8 +44,7 @@ filter(mtcars, row_number() == n())
filter(mtcars, between(row_number(), 5, n()))
}
\seealso{
-Other single.table.verbs: \code{\link{arrange}},
+Other single table verbs: \code{\link{arrange}},
\code{\link{filter}}, \code{\link{mutate}},
\code{\link{select}}, \code{\link{summarise}}
}
-
diff --git a/man/sql.Rd b/man/sql.Rd
index 10fda74..73fdbd8 100644
--- a/man/sql.Rd
+++ b/man/sql.Rd
@@ -1,63 +1,17 @@
% Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/sql-escape.r
+% Please edit documentation in R/compat-dbplyr.R
\name{sql}
-\alias{escape}
-\alias{ident}
-\alias{is.ident}
-\alias{is.sql}
\alias{sql}
-\alias{sql_vector}
\title{SQL escaping.}
\usage{
sql(...)
-
-ident(...)
-
-is.sql(x)
-
-is.ident(x)
-
-escape(x, parens = NA, collapse = " ", con = NULL)
-
-sql_vector(x, parens = NA, collapse = " ", con = NULL)
}
\arguments{
\item{...}{Character vectors that will be combined into a single SQL
-expression. \code{ident} flags its input as a identifier, to ensure that
-it gets the correct quoting.}
-
-\item{x}{An object to escape. Existing sql vectors will be left as is,
-character vectors are escaped with single quotes, numeric vectors have
-trailing \code{.0} added if they're whole numbers, identifiers are
-escaped with double quotes.}
-
-\item{parens, collapse}{Controls behaviour when multiple values are supplied.
- \code{parens} should be a logical flag, or if \code{NA}, will wrap in
- parens if length > 1.
-
- Default behaviour: lists are always wrapped in parens and separated by
- commas, identifiers are separated by commas and never wrapped,
- atomic vectors are separated by spaces and wrapped in parens if needed.}
+expression.}
}
\description{
These functions are critical when writing functions that translate R
functions to sql functions. Typically a conversion function should escape
-all it's inputs and return an sql object.
+all its inputs and return an sql object.
}
-\examples{
-# Doubles vs. integers
-escape(1:5)
-escape(c(1, 5.4))
-
-# String vs known sql vs. sql identifier
-escape("X")
-escape(sql("X"))
-escape(ident("X"))
-
-# Escaping is idempotent
-escape("X")
-escape(escape("X"))
-escape(escape(escape("X")))
-}
-\keyword{internal}
-
diff --git a/man/sql_build.Rd b/man/sql_build.Rd
deleted file mode 100644
index b51ccdd..0000000
--- a/man/sql_build.Rd
+++ /dev/null
@@ -1,53 +0,0 @@
-% Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/sql-build.R, R/sql-query.R, R/sql-render.R
-\name{sql_build}
-\alias{join_query}
-\alias{select_query}
-\alias{semi_join_query}
-\alias{set_op_query}
-\alias{sql_build}
-\alias{sql_render}
-\title{Build and render SQL from a sequence of lazy operations}
-\usage{
-sql_build(op, con, ...)
-
-select_query(from, select = sql("*"), where = character(),
- group_by = character(), having = character(), order_by = character(),
- limit = NULL, distinct = FALSE)
-
-join_query(x, y, type = "inner", by = NULL, suffix = c(".x", ".y"))
-
-semi_join_query(x, y, anti = FALSE, by = NULL)
-
-set_op_query(x, y, type = type)
-
-sql_render(query, con = NULL, ...)
-}
-\arguments{
-\item{op}{A sequence of lazy operations}
-
-\item{con}{A database connection. The default \code{NULL} uses a set of
-rules that should be very similar to ANSI 92, and allows for testing
-without an active database connection.}
-
-\item{...}{Other arguments passed on to the methods. Not currently used.}
-}
-\description{
-\code{sql_build} creates a \code{select_query} S3 object, that is rendered
-to a SQL string by \code{sql_render}. The output from \code{sql_build} is
-designed to be easy to test, as it's database diagnostic, and has
-a hierarchical structure.
-}
-\details{
-\code{sql_build} is generic over the lazy operations, \link{lazy_ops},
-and generates an S3 object that represents the query. \code{sql_render}
-takes a query object and then calls a function that is generic
-over the database. For example, \code{sql_build.op_mutate} generates
-a \code{select_query}, and \code{sql_render.select_query} calls
-\code{sql_select}, which has different methods for different databases.
-The default methods should generate ANSI 92 SQL where possible, so you
-backends only need to override the methods if the backend is not ANSI
-compliant.
-}
-\keyword{internal}
-
diff --git a/man/sql_quote.Rd b/man/sql_quote.Rd
deleted file mode 100644
index 63f15b0..0000000
--- a/man/sql_quote.Rd
+++ /dev/null
@@ -1,24 +0,0 @@
-% Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/sql-escape.r
-\name{sql_quote}
-\alias{sql_quote}
-\title{Helper function for quoting sql elements.}
-\usage{
-sql_quote(x, quote)
-}
-\arguments{
-\item{x}{Character vector to escape.}
-
-\item{quote}{Single quoting character.}
-}
-\description{
-If the quote character is present in the string, it will be doubled.
-\code{NA}s will be replaced with NULL.
-}
-\examples{
-sql_quote("abc", "'")
-sql_quote("I've had a good day", "'")
-sql_quote(c("abc", NA), "'")
-}
-\keyword{internal}
-
diff --git a/man/sql_variant.Rd b/man/sql_variant.Rd
deleted file mode 100644
index 916f01b..0000000
--- a/man/sql_variant.Rd
+++ /dev/null
@@ -1,94 +0,0 @@
-% Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/translate-sql-helpers.r, R/translate-sql-base.r
-\docType{data}
-\name{sql_variant}
-\alias{base_agg}
-\alias{base_no_win}
-\alias{base_scalar}
-\alias{base_win}
-\alias{sql_infix}
-\alias{sql_not_supported}
-\alias{sql_prefix}
-\alias{sql_translator}
-\alias{sql_variant}
-\title{Create an sql translator}
-\usage{
-sql_variant(scalar = sql_translator(), aggregate = sql_translator(),
- window = sql_translator())
-
-sql_translator(..., .funs = list(), .parent = new.env(parent = emptyenv()))
-
-sql_infix(f)
-
-sql_prefix(f, n = NULL)
-
-sql_not_supported(f)
-
-'base_scalar'
-
-'base_agg'
-
-'base_win'
-
-'base_no_win'
-}
-\arguments{
-\item{scalar, aggregate, window}{The three families of functions than an
-SQL variant can supply.}
-
-\item{..., .funs}{named functions, used to add custom converters from standard
-R functions to sql functions. Specify individually in \code{...}, or
-provide a list of \code{.funs}}
-
-\item{.parent}{the sql variant that this variant should inherit from.
-Defaults to \code{base_sql} which provides a standard set of
-mappings for the most common operators and functions.}
-
-\item{f}{the name of the sql function as a string}
-
-\item{n}{for \code{sql_infix}, an optional number of arguments to expect.
-Will signal error if not correct.}
-}
-\description{
-When creating a package that maps to a new SQL based src, you'll often
-want to provide some additional mappings from common R commands to the
-commands that your tbl provides. These three functions make that
-easy.
-}
-\section{Helper functions}{
-
-
-\code{sql_infix} and \code{sql_prefix} create default SQL infix and prefix
-functions given the name of the SQL function. They don't perform any input
-checking, but do correctly escape their input, and are useful for
-quickly providing default wrappers for a new SQL variant.
-}
-\examples{
-# An example of adding some mappings for the statistical functions that
-# postgresql provides: http://bit.ly/K5EdTn
-
-postgres_agg <- sql_translator(.parent = base_agg,
- cor = sql_prefix("corr"),
- cov = sql_prefix("covar_samp"),
- sd = sql_prefix("stddev_samp"),
- var = sql_prefix("var_samp")
-)
-postgres_var <- sql_variant(
- base_scalar,
- postgres_agg
-)
-
-translate_sql(cor(x, y), variant = postgres_var)
-translate_sql(sd(income / years), variant = postgres_var)
-
-# Any functions not explicitly listed in the converter will be translated
-# to sql as is, so you don't need to convert all functions.
-translate_sql(regr_intercept(y, x), variant = postgres_var)
-}
-\seealso{
-\code{\link{sql}} for an example of a more customised sql
- conversion function.
-}
-\keyword{datasets}
-\keyword{internal}
-
diff --git a/man/src-test.Rd b/man/src-test.Rd
deleted file mode 100644
index 76cf65e..0000000
--- a/man/src-test.Rd
+++ /dev/null
@@ -1,28 +0,0 @@
-% Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/src-test.r
-\name{src-test}
-\alias{db_query_fields.DBITestConnection}
-\alias{sql_escape_ident.DBITestConnection}
-\alias{sql_translate_env.DBITestConnection}
-\alias{src-test}
-\title{A set of DBI methods to ease unit testing dplyr with DBI}
-\usage{
-\method{db_query_fields}{DBITestConnection}(con, sql, ...)
-
-\method{sql_escape_ident}{DBITestConnection}(con, x)
-
-\method{sql_translate_env}{DBITestConnection}(con)
-}
-\arguments{
-\item{con}{A database connection.}
-
-\item{sql}{A string containing an sql query.}
-
-\item{...}{Other arguments passed on to the individual methods}
-
-\item{x}{Object to transform}
-}
-\description{
-A set of DBI methods to ease unit testing dplyr with DBI
-}
-
diff --git a/man/src.Rd b/man/src.Rd
index 3b18657..1cc6f1d 100644
--- a/man/src.Rd
+++ b/man/src.Rd
@@ -1,8 +1,8 @@
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/src.r
\name{src}
-\alias{is.src}
\alias{src}
+\alias{is.src}
\title{Create a "src" object}
\usage{
src(subclass, ...)
@@ -14,12 +14,13 @@ is.src(x)
must supply this value. \code{src_} is automatically prepended to the
class name}
-\item{...}{fields used by object}
+\item{...}{fields used by object.
+
+These dots are evaluated with \link[rlang:dots_list]{explicit splicing}.}
\item{x}{object to test for "src"-ness.}
}
\description{
-\code{src} is the standard constructor for srcs and \code{is.src} tests.
+\code{src()} is the standard constructor for srcs and \code{is.src()} tests.
}
\keyword{internal}
-
diff --git a/man/src_dbi.Rd b/man/src_dbi.Rd
new file mode 100644
index 0000000..ad3cacf
--- /dev/null
+++ b/man/src_dbi.Rd
@@ -0,0 +1,121 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/src_dbi.R
+\name{src_dbi}
+\alias{src_dbi}
+\alias{src_mysql}
+\alias{src_postgres}
+\alias{src_sqlite}
+\title{Source for database backends}
+\usage{
+src_mysql(dbname, host = NULL, port = 0L, username = "root",
+ password = "", ...)
+
+src_postgres(dbname = NULL, host = NULL, port = NULL, user = NULL,
+ password = NULL, ...)
+
+src_sqlite(path, create = FALSE)
+}
+\arguments{
+\item{dbname}{Database name}
+
+\item{host, port}{Host name and port number of database}
+
+\item{...}{for the src, other arguments passed on to the underlying
+database connector, \code{\link[DBI:dbConnect]{DBI::dbConnect()}}. For the tbl, included for
+compatibility with the generic, but otherwise ignored.}
+
+\item{user, username, password}{User name and password.
+
+Generally, you should avoid saving username and password in your
+scripts as it is easy to accidentally expose valuable credentials.
+Instead, retrieve them from environment variables, or use database
+specific credential stores. For example, with MySQL you can set up \code{my.cnf}
+as described in \code{\link[RMySQL:MySQL]{RMySQL::MySQL()}}.}
+
+\item{path}{Path to SQLite database. You can use the special path
+":memory:" to create a temporary in memory database.}
+
+\item{create}{if \code{FALSE}, \code{path} must already exist. If
+\code{TRUE}, will create a new SQLite3 database at \code{path} if
+\code{path} does not exist and connect to the existing database if
+\code{path} does exist.}
+}
+\value{
+An S3 object with class \code{src_dbi}, \code{src_sql}, \code{src}.
+}
+\description{
+For backward compatibility dplyr provides three srcs for popular
+open source databases:
+\itemize{
+\item \code{src_mysql()} connects to a MySQL or MariaDB database using \code{\link[RMySQL:MySQL]{RMySQL::MySQL()}}.
+\item \code{src_postgres()} connects to PostgreSQL using \code{\link[RPostgreSQL:PostgreSQL]{RPostgreSQL::PostgreSQL()}}
+\item \code{src_sqlite()} to connect to a SQLite database using \code{\link[RSQLite:SQLite]{RSQLite::SQLite()}}.
+}
+
+However, modern best practice is to use \code{\link[=tbl]{tbl()}} directly on an \code{DBIConnection}.
+}
+\details{
+All data manipulation on SQL tbls are lazy: they will not actually
+run the query or retrieve the data unless you ask for it: they all return
+a new \code{tbl_dbi} object. Use \code{\link[=compute]{compute()}} to run the query and save the
+results in a temporary table in the database, or use \code{\link[=collect]{collect()}} to retrieve the
+results to R. You can see the query with \code{\link[=show_query]{show_query()}}.
+
+For best performance, the database should have an index on the variables
+that you are grouping by. Use \code{\link[=explain]{explain()}} to check that the database is using
+the indexes that you expect.
+
+There is one exception: \code{\link[=do]{do()}} is not lazy since it must pull the data
+into R.
+}
+\examples{
+# Basic connection using DBI -------------------------------------------
+if (require(dbplyr, quietly = TRUE)) {
+
+con <- DBI::dbConnect(RSQLite::SQLite(), ":memory:")
+copy_to(con, mtcars)
+
+DBI::dbListTables(con)
+
+# To retrieve a single table from a source, use `tbl()`
+con \%>\% tbl("mtcars")
+
+# You can also pass raw SQL if you want a more sophisticated query
+con \%>\% tbl(sql("SELECT * FROM mtcars WHERE cyl == 8"))
+
+# To show off the full features of dplyr's database integration,
+# we'll use the Lahman database. lahman_sqlite() takes care of
+# creating the database.
+lahman_p <- lahman_sqlite()
+batting <- lahman_p \%>\% tbl("Batting")
+batting
+
+# Basic data manipulation verbs work in the same way as with a tibble
+batting \%>\% filter(yearID > 2005, G > 130)
+batting \%>\% select(playerID:lgID)
+batting \%>\% arrange(playerID, desc(yearID))
+batting \%>\% summarise(G = mean(G), n = n())
+
+# There are a few exceptions. For example, databases give integer results
+# when dividing one integer by another. Multiply by 1 to fix the problem
+batting \%>\%
+ select(playerID:lgID, AB, R, G) \%>\%
+ mutate(
+ R_per_game1 = R / G,
+ R_per_game2 = R * 1.0 / G
+ )
+
+# All operations are lazy: they don't do anything until you request the
+# data, either by `print()`ing it (which shows the first ten rows),
+# or by `collect()`ing the results locally.
+system.time(recent <- filter(batting, yearID > 2010))
+system.time(collect(recent))
+
+# You can see the query that dplyr creates with show_query()
+batting \%>\%
+ filter(G > 0) \%>\%
+ group_by(playerID) \%>\%
+ summarise(n = n()) \%>\%
+ show_query()
+}
+}
diff --git a/man/src_local.Rd b/man/src_local.Rd
index a02dea5..9abb464 100644
--- a/man/src_local.Rd
+++ b/man/src_local.Rd
@@ -1,8 +1,8 @@
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/src-local.r
\name{src_local}
-\alias{src_df}
\alias{src_local}
+\alias{src_df}
\title{A local source.}
\usage{
src_local(tbl, pkg = NULL, env = NULL)
@@ -20,7 +20,7 @@ This is mainly useful for testing, since makes it possible to refer to
local and remote tables using exactly the same syntax.
}
\details{
-Generally, \code{src_local} should not be called directly, but instead
+Generally, \code{src_local()} should not be called directly, but instead
one of the constructors should be used.
}
\examples{
@@ -29,4 +29,3 @@ batting_df <- tbl(src_df("Lahman"), "Batting")
}
}
\keyword{internal}
-
diff --git a/man/src_memdb.Rd b/man/src_memdb.Rd
deleted file mode 100644
index 0ab2a8b..0000000
--- a/man/src_memdb.Rd
+++ /dev/null
@@ -1,34 +0,0 @@
-% Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/src-sqlite.r
-\name{src_memdb}
-\alias{memdb_frame}
-\alias{src_memdb}
-\title{Per-session in-memory SQLite databases.}
-\usage{
-src_memdb()
-
-memdb_frame(..., .name = random_table_name())
-}
-\arguments{
-\item{...}{A set of name-value pairs. Arguments are evaluated sequentially,
-so you can refer to previously created variables.}
-
-\item{.name}{Name of table in database: defaults to a random name that's
-unlikely to conflict with exist}
-}
-\description{
-\code{src_memdb} lets you easily access a sessio-temporary in-memory
-SQLite database. \code{memdb_frame()} works like \code{\link{data_frame}},
-but instead of creating a new data frame in R, it creates a table in
-\code{src_memdb}
-}
-\examples{
-if (require("RSQLite")) {
-src_memdb()
-
-df <- memdb_frame(x = runif(100), y = runif(100))
-df \%>\% arrange(x)
-df \%>\% arrange(x) \%>\% show_query()
-}
-}
-
diff --git a/man/src_mysql.Rd b/man/src_mysql.Rd
deleted file mode 100644
index 65e6fc8..0000000
--- a/man/src_mysql.Rd
+++ /dev/null
@@ -1,171 +0,0 @@
-% Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/src-mysql.r
-\name{src_mysql}
-\alias{src_mysql}
-\alias{tbl.src_mysql}
-\title{Connect to mysql/mariadb.}
-\usage{
-src_mysql(dbname, host = NULL, port = 0L, user = "root", password = "",
- ...)
-
-\method{tbl}{src_mysql}(src, from, ...)
-}
-\arguments{
-\item{dbname}{Database name}
-
-\item{host, port}{Host name and port number of database}
-
-\item{user, password}{User name and password. Rather than supplying a
-username and password here, it's better to save them in \code{my.cnf},
-as described in \code{\link[RMySQL]{MySQL}}. In that case, supply
-\code{NULL} to both \code{user} and \code{password}.}
-
-\item{...}{for the src, other arguments passed on to the underlying
-database connector, \code{dbConnect}. For the tbl, included for
-compatibility with the generic, but otherwise ignored.}
-
-\item{src}{a mysql src created with \code{src_mysql}.}
-
-\item{from}{Either a string giving the name of table in database, or
-\code{\link{sql}} described a derived table or compound join.}
-}
-\description{
-Use \code{src_mysql} to connect to an existing mysql or mariadb database,
-and \code{tbl} to connect to tables within that database.
-If you are running a local mysqlql database, leave all parameters set as
-their defaults to connect. If you're connecting to a remote database,
-ask your database administrator for the values of these variables.
-}
-\section{Debugging}{
-
-
-To see exactly what SQL is being sent to the database, you see
-\code{\link{show_query}} and \code{\link{explain}}.
-}
-
-\section{Grouping}{
-
-
-Typically you will create a grouped data table is to call the \code{group_by}
-method on a mysql tbl: this will take care of capturing
-the unevalated expressions for you.
-
-For best performance, the database should have an index on the variables
-that you are grouping by. Use \code{\link{explain}} to check that
-the database is using the indexes that you expect.
-}
-
-\section{Output}{
-
-
-All data manipulation on SQL tbls are lazy: they will not actually
-run the query or retrieve the data unless you ask for it: they all return
-a new \code{\link{tbl_sql}} object. Use \code{\link{compute}} to run the
-query and save the results in a temporary in the database, or use
-\code{\link{collect}} to retrieve the results to R.
-
-Note that \code{do} is not lazy since it must pull the data into R.
-It returns a \code{\link{tbl_df}} or \code{\link{grouped_df}}, with one
-column for each grouping variable, and one list column that contains the
-results of the operation. \code{do} never simplifies its output.
-}
-
-\section{Query principles}{
-
-
-This section attempts to lay out the principles governing the generation
-of SQL queries from the manipulation verbs. The basic principle is that
-a sequence of operations should return the same value (modulo class)
-regardless of where the data is stored.
-
-\itemize{
- \item \code{arrange(arrange(df, x), y)} should be equivalent to
- \code{arrange(df, y, x)}
-
- \item \code{select(select(df, a:x), n:o)} should be equivalent to
- \code{select(df, n:o)}
-
- \item \code{mutate(mutate(df, x2 = x * 2), y2 = y * 2)} should be
- equivalent to \code{mutate(df, x2 = x * 2, y2 = y * 2)}
-
- \item \code{filter(filter(df, x == 1), y == 2)} should be
- equivalent to \code{filter(df, x == 1, y == 2)}
-
- \item \code{summarise} should return the summarised output with
- one level of grouping peeled off.
-}
-}
-\examples{
-\dontrun{
-# Connection basics ---------------------------------------------------------
-# To connect to a database first create a src:
-my_db <- src_mysql(host = "blah.com", user = "hadley",
- password = "pass")
-# Then reference a tbl within that src
-my_tbl <- tbl(my_db, "my_table")
-}
-
-# Here we'll use the Lahman database: to create your own local copy,
-# create a local database called "lahman", or tell lahman_mysql() how to
-# a database that you can write to
-
-if (!has_lahman("postgres") && has_lahman("mysql")) {
-lahman_m <- lahman_mysql()
-# Methods -------------------------------------------------------------------
-batting <- tbl(lahman_m, "Batting")
-dim(batting)
-colnames(batting)
-head(batting)
-
-# Data manipulation verbs ---------------------------------------------------
-filter(batting, yearID > 2005, G > 130)
-select(batting, playerID:lgID)
-arrange(batting, playerID, desc(yearID))
-summarise(batting, G = mean(G), n = n())
-mutate(batting, rbi2 = 1.0 * R / AB)
-
-# note that all operations are lazy: they don't do anything until you
-# request the data, either by `print()`ing it (which shows the first ten
-# rows), by looking at the `head()`, or `collect()` the results locally.
-
-system.time(recent <- filter(batting, yearID > 2010))
-system.time(collect(recent))
-
-# Group by operations -------------------------------------------------------
-# To perform operations by group, create a grouped object with group_by
-players <- group_by(batting, playerID)
-group_size(players)
-
-# MySQL doesn't support windowed functions, which means that only
-# grouped summaries are really useful:
-summarise(players, mean_g = mean(G), best_ab = max(AB))
-
-# When you group by multiple level, each summarise peels off one level
-per_year <- group_by(batting, playerID, yearID)
-stints <- summarise(per_year, stints = max(stint))
-filter(ungroup(stints), stints > 3)
-summarise(stints, max(stints))
-
-# Joins ---------------------------------------------------------------------
-player_info <- select(tbl(lahman_m, "Master"), playerID,
- birthYear)
-hof <- select(filter(tbl(lahman_m, "HallOfFame"), inducted == "Y"),
- playerID, votedBy, category)
-
-# Match players and their hall of fame data
-inner_join(player_info, hof)
-# Keep all players, match hof data where available
-left_join(player_info, hof)
-# Find only players in hof
-semi_join(player_info, hof)
-# Find players not in hof
-anti_join(player_info, hof)
-
-# Arbitrary SQL -------------------------------------------------------------
-# You can also provide sql as is, using the sql function:
-batting2008 <- tbl(lahman_m,
- sql("SELECT * FROM Batting WHERE YearID = 2008"))
-batting2008
-}
-}
-
diff --git a/man/src_postgres.Rd b/man/src_postgres.Rd
deleted file mode 100644
index 646e181..0000000
--- a/man/src_postgres.Rd
+++ /dev/null
@@ -1,173 +0,0 @@
-% Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/src-postgres.r
-\name{src_postgres}
-\alias{src_postgres}
-\alias{tbl.src_postgres}
-\title{Connect to postgresql.}
-\usage{
-src_postgres(dbname = NULL, host = NULL, port = NULL, user = NULL,
- password = NULL, ...)
-
-\method{tbl}{src_postgres}(src, from, ...)
-}
-\arguments{
-\item{dbname}{Database name}
-
-\item{host, port}{Host name and port number of database}
-
-\item{user, password}{User name and password (if needed)}
-
-\item{...}{for the src, other arguments passed on to the underlying
-database connector, \code{dbConnect}. For the tbl, included for
-compatibility with the generic, but otherwise ignored.}
-
-\item{src}{a postgres src created with \code{src_postgres}.}
-
-\item{from}{Either a string giving the name of table in database, or
-\code{\link{sql}} described a derived table or compound join.}
-}
-\description{
-Use \code{src_postgres} to connect to an existing postgresql database,
-and \code{tbl} to connect to tables within that database.
-If you are running a local postgresql database, leave all parameters set as
-their defaults to connect. If you're connecting to a remote database,
-ask your database administrator for the values of these variables.
-}
-\section{Debugging}{
-
-
-To see exactly what SQL is being sent to the database, you see
-\code{\link{show_query}} and \code{\link{explain}}.
-}
-
-\section{Grouping}{
-
-
-Typically you will create a grouped data table is to call the \code{group_by}
-method on a mysql tbl: this will take care of capturing
-the unevalated expressions for you.
-
-For best performance, the database should have an index on the variables
-that you are grouping by. Use \code{\link{explain}} to check that
-the database is using the indexes that you expect.
-}
-
-\section{Output}{
-
-
-All data manipulation on SQL tbls are lazy: they will not actually
-run the query or retrieve the data unless you ask for it: they all return
-a new \code{\link{tbl_sql}} object. Use \code{\link{compute}} to run the
-query and save the results in a temporary in the database, or use
-\code{\link{collect}} to retrieve the results to R.
-
-Note that \code{do} is not lazy since it must pull the data into R.
-It returns a \code{\link{tbl_df}} or \code{\link{grouped_df}}, with one
-column for each grouping variable, and one list column that contains the
-results of the operation. \code{do} never simplifies its output.
-}
-
-\section{Query principles}{
-
-
-This section attempts to lay out the principles governing the generation
-of SQL queries from the manipulation verbs. The basic principle is that
-a sequence of operations should return the same value (modulo class)
-regardless of where the data is stored.
-
-\itemize{
- \item \code{arrange(arrange(df, x), y)} should be equivalent to
- \code{arrange(df, y, x)}
-
- \item \code{select(select(df, a:x), n:o)} should be equivalent to
- \code{select(df, n:o)}
-
- \item \code{mutate(mutate(df, x2 = x * 2), y2 = y * 2)} should be
- equivalent to \code{mutate(df, x2 = x * 2, y2 = y * 2)}
-
- \item \code{filter(filter(df, x == 1), y == 2)} should be
- equivalent to \code{filter(df, x == 1, y == 2)}
-
- \item \code{summarise} should return the summarised output with
- one level of grouping peeled off.
-}
-}
-\examples{
-\dontrun{
-# Connection basics ---------------------------------------------------------
-# To connect to a database first create a src:
-my_db <- src_postgres(host = "blah.com", user = "hadley",
- password = "pass")
-# Then reference a tbl within that src
-my_tbl <- tbl(my_db, "my_table")
-}
-
-# Here we'll use the Lahman database: to create your own local copy,
-# create a local database called "lahman", or tell lahman_postgres() how to
-# access a database that you can write to
-
-if (has_lahman("postgres")) {
-lahman_p <- lahman_postgres()
-# Methods -------------------------------------------------------------------
-batting <- tbl(lahman_p, "Batting")
-dim(batting)
-colnames(batting)
-head(batting)
-
-# Data manipulation verbs ---------------------------------------------------
-filter(batting, yearID > 2005, G > 130)
-select(batting, playerID:lgID)
-arrange(batting, playerID, desc(yearID))
-summarise(batting, G = mean(G), n = n())
-mutate(batting, rbi2 = if(is.null(AB)) 1.0 * R / AB else 0)
-
-# note that all operations are lazy: they don't do anything until you
-# request the data, either by `print()`ing it (which shows the first ten
-# rows), by looking at the `head()`, or `collect()` the results locally.
-
-system.time(recent <- filter(batting, yearID > 2010))
-system.time(collect(recent))
-
-# Group by operations -------------------------------------------------------
-# To perform operations by group, create a grouped object with group_by
-players <- group_by(batting, playerID)
-group_size(players)
-
-summarise(players, mean_g = mean(G), best_ab = max(AB))
-best_year <- filter(players, AB == max(AB) | G == max(G))
-best_year
-
-progress <- mutate(players,
- cyear = yearID - min(yearID) + 1,
- ab_rank = rank(desc(AB)),
- cumulative_ab = order_by(yearID, cumsum(AB)))
-
-# When you group by multiple level, each summarise peels off one level
-per_year <- group_by(batting, playerID, yearID)
-stints <- summarise(per_year, stints = max(stint))
-filter(stints, stints > 3)
-summarise(stints, max(stints))
-mutate(stints, order_by(yearID, cumsum(stints)))
-
-# Joins ---------------------------------------------------------------------
-player_info <- select(tbl(lahman_p, "Master"), playerID, birthYear)
-hof <- select(filter(tbl(lahman_p, "HallOfFame"), inducted == "Y"),
- playerID, votedBy, category)
-
-# Match players and their hall of fame data
-inner_join(player_info, hof)
-# Keep all players, match hof data where available
-left_join(player_info, hof)
-# Find only players in hof
-semi_join(player_info, hof)
-# Find players not in hof
-anti_join(player_info, hof)
-
-# Arbitrary SQL -------------------------------------------------------------
-# You can also provide sql as is, using the sql function:
-batting2008 <- tbl(lahman_p,
- sql('SELECT * FROM "Batting" WHERE "yearID" = 2008'))
-batting2008
-}
-}
-
diff --git a/man/src_sql.Rd b/man/src_sql.Rd
deleted file mode 100644
index 8a265c3..0000000
--- a/man/src_sql.Rd
+++ /dev/null
@@ -1,22 +0,0 @@
-% Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/src-sql.r
-\name{src_sql}
-\alias{src_sql}
-\title{Create a "sql src" object}
-\usage{
-src_sql(subclass, con, ...)
-}
-\arguments{
-\item{subclass}{name of subclass. "src_sql" is an abstract base class, so you
-must supply this value. \code{src_} is automatically prepended to the
-class name}
-
-\item{con}{the connection object}
-
-\item{...}{fields used by object}
-}
-\description{
-\code{src_sql} is the standard constructor for all SQL based srcs.
-}
-\keyword{internal}
-
diff --git a/man/src_sqlite.Rd b/man/src_sqlite.Rd
deleted file mode 100644
index 6416dc8..0000000
--- a/man/src_sqlite.Rd
+++ /dev/null
@@ -1,168 +0,0 @@
-% Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/src-sqlite.r
-\name{src_sqlite}
-\alias{src_sqlite}
-\alias{tbl.src_sqlite}
-\title{Connect to a sqlite database.}
-\usage{
-src_sqlite(path, create = FALSE)
-
-\method{tbl}{src_sqlite}(src, from, ...)
-}
-\arguments{
-\item{path}{Path to SQLite database}
-
-\item{create}{if \code{FALSE}, \code{path} must already exist. If
-\code{TRUE}, will create a new SQlite3 database at \code{path} if
-\code{path} does not exist and connect to the existing database if
-\code{path} does exist.}
-
-\item{src}{a sqlite src created with \code{src_sqlite}.}
-
-\item{from}{Either a string giving the name of table in database, or
-\code{\link{sql}} described a derived table or compound join.}
-
-\item{...}{Included for compatibility with the generic, but otherwise
-ignored.}
-}
-\description{
-Use \code{src_sqlite} to connect to an existing sqlite database,
-and \code{tbl} to connect to tables within that database.
-If you are running a local sqliteql database, leave all parameters set as
-their defaults to connect. If you're connecting to a remote database,
-ask your database administrator for the values of these variables.
-\code{\link{src_memdb}} is an easy way to use an in-memory SQLite database
-that is scoped to the current session.
-}
-\section{Debugging}{
-
-
-To see exactly what SQL is being sent to the database, you see
-\code{\link{show_query}} and \code{\link{explain}}.
-}
-
-\section{Grouping}{
-
-
-Typically you will create a grouped data table is to call the \code{group_by}
-method on a mysql tbl: this will take care of capturing
-the unevalated expressions for you.
-
-For best performance, the database should have an index on the variables
-that you are grouping by. Use \code{\link{explain}} to check that
-the database is using the indexes that you expect.
-}
-
-\section{Output}{
-
-
-All data manipulation on SQL tbls are lazy: they will not actually
-run the query or retrieve the data unless you ask for it: they all return
-a new \code{\link{tbl_sql}} object. Use \code{\link{compute}} to run the
-query and save the results in a temporary in the database, or use
-\code{\link{collect}} to retrieve the results to R.
-
-Note that \code{do} is not lazy since it must pull the data into R.
-It returns a \code{\link{tbl_df}} or \code{\link{grouped_df}}, with one
-column for each grouping variable, and one list column that contains the
-results of the operation. \code{do} never simplifies its output.
-}
-
-\section{Query principles}{
-
-
-This section attempts to lay out the principles governing the generation
-of SQL queries from the manipulation verbs. The basic principle is that
-a sequence of operations should return the same value (modulo class)
-regardless of where the data is stored.
-
-\itemize{
- \item \code{arrange(arrange(df, x), y)} should be equivalent to
- \code{arrange(df, y, x)}
-
- \item \code{select(select(df, a:x), n:o)} should be equivalent to
- \code{select(df, n:o)}
-
- \item \code{mutate(mutate(df, x2 = x * 2), y2 = y * 2)} should be
- equivalent to \code{mutate(df, x2 = x * 2, y2 = y * 2)}
-
- \item \code{filter(filter(df, x == 1), y == 2)} should be
- equivalent to \code{filter(df, x == 1, y == 2)}
-
- \item \code{summarise} should return the summarised output with
- one level of grouping peeled off.
-}
-}
-\examples{
-\dontrun{
-# Connection basics ---------------------------------------------------------
-# To connect to a database first create a src:
-my_db <- src_sqlite(path = tempfile(), create = TRUE)
-# Then reference a tbl within that src
-my_tbl <- tbl(my_db, "my_table")
-}
-
-# Here we'll use the Lahman database: to create your own local copy,
-# run lahman_sqlite()
-
-\dontrun{
-if (requireNamespace("RSQLite") && has_lahman("sqlite")) {
-lahman_s <- lahman_sqlite()
-# Methods -------------------------------------------------------------------
-batting <- tbl(lahman_s, "Batting")
-dim(batting)
-colnames(batting)
-head(batting)
-
-# Data manipulation verbs ---------------------------------------------------
-filter(batting, yearID > 2005, G > 130)
-select(batting, playerID:lgID)
-arrange(batting, playerID, desc(yearID))
-summarise(batting, G = mean(G), n = n())
-mutate(batting, rbi2 = 1.0 * R / AB)
-
-# note that all operations are lazy: they don't do anything until you
-# request the data, either by `print()`ing it (which shows the first ten
-# rows), by looking at the `head()`, or `collect()` the results locally.
-
-system.time(recent <- filter(batting, yearID > 2010))
-system.time(collect(recent))
-
-# Group by operations -------------------------------------------------------
-# To perform operations by group, create a grouped object with group_by
-players <- group_by(batting, playerID)
-group_size(players)
-
-# sqlite doesn't support windowed functions, which means that only
-# grouped summaries are really useful:
-summarise(players, mean_g = mean(G), best_ab = max(AB))
-
-# When you group by multiple level, each summarise peels off one level
-per_year <- group_by(batting, playerID, yearID)
-stints <- summarise(per_year, stints = max(stint))
-filter(ungroup(stints), stints > 3)
-summarise(stints, max(stints))
-
-# Joins ---------------------------------------------------------------------
-player_info <- select(tbl(lahman_s, "Master"), playerID, birthYear)
-hof <- select(filter(tbl(lahman_s, "HallOfFame"), inducted == "Y"),
- playerID, votedBy, category)
-
-# Match players and their hall of fame data
-inner_join(player_info, hof)
-# Keep all players, match hof data where available
-left_join(player_info, hof)
-# Find only players in hof
-semi_join(player_info, hof)
-# Find players not in hof
-anti_join(player_info, hof)
-
-# Arbitrary SQL -------------------------------------------------------------
-# You can also provide sql as is, using the sql function:
-batting2008 <- tbl(lahman_s,
- sql("SELECT * FROM Batting WHERE YearID = 2008"))
-batting2008
-}
-}
-}
-
diff --git a/man/src_tbls.Rd b/man/src_tbls.Rd
index 7028485..57b3f73 100644
--- a/man/src_tbls.Rd
+++ b/man/src_tbls.Rd
@@ -14,4 +14,4 @@ This is a generic method which individual src's will provide methods for.
Most methods will not be documented because it's usually pretty obvious what
possible results will be.
}
-
+\keyword{internal}
diff --git a/man/starwars.Rd b/man/starwars.Rd
new file mode 100644
index 0000000..371d33e
--- /dev/null
+++ b/man/starwars.Rd
@@ -0,0 +1,30 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/data-starwars.R
+\docType{data}
+\name{starwars}
+\alias{starwars}
+\title{Starwars characters}
+\format{A tibble with 87 rows and 13 variables:
+\describe{
+\item{name}{Name of the character}
+\item{height}{Height (cm)}
+\item{mass}{Weight (kg)}
+\item{hair_color,skin_color,eye_color}{Hair, skin, and eye colors}
+\item{birth_year}{Year born (BBY = Before Battle of Yavin)}
+\item{gender}{male, female, hermaphrodite, or none.}
+\item{homeworld}{Name of homeworld}
+\item{species}{Name of species}
+\item{films}{List of films the character appeared in}
+\item{vehicles}{List of vehicles the character has piloted}
+\item{starships}{List of starships the character has piloted}
+}}
+\usage{
+starwars
+}
+\description{
+This data comes from SWAPI, the Star Wars API, \url{http://swapi.co/}
+}
+\examples{
+starwars
+}
+\keyword{datasets}
diff --git a/man/storms.Rd b/man/storms.Rd
new file mode 100644
index 0000000..e0488b3
--- /dev/null
+++ b/man/storms.Rd
@@ -0,0 +1,34 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/data-storms.R
+\docType{data}
+\name{storms}
+\alias{storms}
+\title{Storm tracks data}
+\format{A tibble with 10,010 observations and 13 variables:
+\describe{
+\item{name}{Storm Name}
+\item{year,month,day}{Date of report}
+\item{hour}{Hour of report (in UTC)}
+\item{lat,long}{Location of storm center}
+\item{status}{Storm classification (Tropical Depression, Tropical Storm,
+or Hurricane)}
+\item{category}{Saffir-Simpson storm category (estimated from wind speed.
+-1 = Tropical Depression, 0 = Tropical Storm)}
+\item{wind}{storm's maximum sustained wind speed (in knots)}
+\item{pressure}{Air pressure at the storm's center (in millibars)}
+\item{ts_diameter}{Diameter of the area experiencing tropical storm strength winds (34 knots or above)}
+\item{hu_diameter}{Diameter of the area experiencing hurricane strength winds (64 knots or above)}
+}}
+\usage{
+storms
+}
+\description{
+This data is a subset of the NOAA Atlantic hurricane database best track
+data, \url{http://www.nhc.noaa.gov/data/#hurdat}. The data includes the
+positions and attributes of 198 tropical storms, measured every six hours
+during the lifetime of a storm.
+}
+\examples{
+storms
+}
+\keyword{datasets}
diff --git a/man/summarise.Rd b/man/summarise.Rd
index 2c190b5..3ae17bc 100644
--- a/man/summarise.Rd
+++ b/man/summarise.Rd
@@ -2,67 +2,91 @@
% Please edit documentation in R/manip.r
\name{summarise}
\alias{summarise}
-\alias{summarise_}
\alias{summarize}
-\alias{summarize_}
-\title{Summarise multiple values to a single value.}
+\title{Reduces multiple values down to a single value}
\usage{
summarise(.data, ...)
-summarise_(.data, ..., .dots)
-
summarize(.data, ...)
-
-summarize_(.data, ..., .dots)
}
\arguments{
\item{.data}{A tbl. All main verbs are S3 generics and provide methods
-for \code{\link{tbl_df}}, \code{\link[dtplyr]{tbl_dt}} and \code{\link{tbl_sql}}.}
+for \code{\link[=tbl_df]{tbl_df()}}, \code{\link[dtplyr:tbl_dt]{dtplyr::tbl_dt()}} and \code{\link[dbplyr:tbl_dbi]{dbplyr::tbl_dbi()}}.}
-\item{...}{Name-value pairs of summary functions like \code{\link{min}()},
-\code{\link{mean}()}, \code{\link{max}()} etc.}
+\item{...}{Name-value pairs of summary functions. The name will be the
+name of the variable in the result. The value should be an expression
+that returns a single value like \code{min(x)}, \code{n()}, or \code{sum(is.na(y))}.
-\item{.dots}{Used to work around non-standard evaluation. See
-\code{vignette("nse")} for details.}
+These arguments are automatically \link[rlang:quo]{quoted} and
+\link[rlang:eval_tidy]{evaluated} in the context of the data
+frame. They support \link[rlang:quasiquotation]{unquoting} and
+splicing. See \code{vignette("programming")} for an introduction to
+these concepts.}
}
\value{
An object of the same class as \code{.data}. One grouping level will
- be dropped.
-
- Data frame row names are silently dropped. To preserve, convert to an
- explicit variable.
+be dropped.
}
\description{
-Summarise multiple values to a single value.
+\code{summarise()} is typically used on grouped data created by \code{\link[=group_by]{group_by()}}.
+The output will have one row for each group.
+}
+\section{Useful functions}{
+
+\itemize{
+\item Center: \code{\link[=mean]{mean()}}, \code{\link[=median]{median()}}
+\item Spread: \code{\link[=sd]{sd()}}, \code{\link[=IQR]{IQR()}}, \code{\link[=mad]{mad()}}
+\item Range: \code{\link[=min]{min()}}, \code{\link[=max]{max()}}, \code{\link[=quantile]{quantile()}}
+\item Position: \code{\link[=first]{first()}}, \code{\link[=last]{last()}}, \code{\link[=nth]{nth()}},
+\item Count: \code{\link[=n]{n()}}, \code{\link[=n_distinct]{n_distinct()}}
+\item Logical: \code{\link[=any]{any()}}, \code{\link[=all]{all()}}
+}
}
+
\section{Backend variations}{
Data frames are the only backend that supports creating a variable and
using it in the same summary. See examples for more details.
}
+
+\section{Tidy data}{
+
+When applied to a data frame, row names are silently dropped. To preserve,
+convert to an explicit variable with \code{\link[tibble:rownames_to_column]{tibble::rownames_to_column()}}.
+}
+
\examples{
-summarise(mtcars, mean(disp))
-summarise(group_by(mtcars, cyl), mean(disp))
-summarise(group_by(mtcars, cyl), m = mean(disp), sd = sd(disp))
+# A summary applied to ungrouped tbl returns a single row
+mtcars \%>\%
+ summarise(mean = mean(disp), n = n())
-# With data frames, you can create and immediately use summaries
-by_cyl <- mtcars \%>\% group_by(cyl)
-by_cyl \%>\% summarise(a = n(), b = a + 1)
+# Usually, you'll want to group first
+mtcars \%>\%
+ group_by(cyl) \%>\%
+ summarise(mean = mean(disp), n = n())
-\dontrun{
-# You can't with data tables or databases
-by_cyl_dt <- mtcars \%>\% dtplyr::tbl_dt() \%>\% group_by(cyl)
-by_cyl_dt \%>\% summarise(a = n(), b = a + 1)
+# Each summary call removes one grouping level (since that group
+# is now just a single row)
+mtcars \%>\%
+ group_by(cyl, vs) \%>\%
+ summarise(cyl_n = n()) \%>\%
+ group_vars()
-by_cyl_db <- src_sqlite(":memory:", create = TRUE) \%>\%
- copy_to(mtcars) \%>\% group_by(cyl)
-by_cyl_db \%>\% summarise(a = n(), b = a + 1)
-}
+# Note that with data frames, newly created summaries immediately
+# overwrite existing variables
+mtcars \%>\%
+ group_by(cyl) \%>\%
+ summarise(disp = mean(disp), sd = sd(disp))
+
+
+# summarise() supports quasiquotation. You can unquote raw
+# expressions or quosures:
+var <- quo(mean(cyl))
+summarise(mtcars, !! var)
}
\seealso{
-Other single.table.verbs: \code{\link{arrange}},
+Other single table verbs: \code{\link{arrange}},
\code{\link{filter}}, \code{\link{mutate}},
\code{\link{select}}, \code{\link{slice}}
}
-
diff --git a/man/summarise_all.Rd b/man/summarise_all.Rd
index 3f7a2f6..0d83bf8 100644
--- a/man/summarise_all.Rd
+++ b/man/summarise_all.Rd
@@ -1,113 +1,130 @@
% Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/colwise.R
+% Please edit documentation in R/colwise-mutate.R
\name{summarise_all}
-\alias{mutate_all}
-\alias{mutate_at}
-\alias{mutate_each_q}
-\alias{mutate_if}
\alias{summarise_all}
-\alias{summarise_at}
-\alias{summarise_each_q}
\alias{summarise_if}
+\alias{summarise_at}
\alias{summarize_all}
-\alias{summarize_at}
\alias{summarize_if}
+\alias{summarize_at}
+\alias{mutate_all}
+\alias{mutate_if}
+\alias{mutate_at}
+\alias{transmute_all}
+\alias{transmute_if}
+\alias{transmute_at}
\title{Summarise and mutate multiple columns.}
\usage{
summarise_all(.tbl, .funs, ...)
-mutate_all(.tbl, .funs, ...)
-
summarise_if(.tbl, .predicate, .funs, ...)
-mutate_if(.tbl, .predicate, .funs, ...)
+summarise_at(.tbl, .vars, .funs, ..., .cols = NULL)
-summarise_at(.tbl, .cols, .funs, ...)
+summarize_all(.tbl, .funs, ...)
-mutate_at(.tbl, .cols, .funs, ...)
+summarize_if(.tbl, .predicate, .funs, ...)
-summarize_all(.tbl, .funs, ...)
+summarize_at(.tbl, .vars, .funs, ..., .cols = NULL)
-summarize_at(.tbl, .cols, .funs, ...)
+mutate_all(.tbl, .funs, ...)
-summarize_if(.tbl, .predicate, .funs, ...)
+mutate_if(.tbl, .predicate, .funs, ...)
+
+mutate_at(.tbl, .vars, .funs, ..., .cols = NULL)
+
+transmute_all(.tbl, .funs, ...)
+
+transmute_if(.tbl, .predicate, .funs, ...)
+
+transmute_at(.tbl, .vars, .funs, ..., .cols = NULL)
}
\arguments{
-\item{.tbl}{a tbl}
+\item{.tbl}{A \code{tbl} object.}
+
+\item{.funs}{List of function calls generated by \code{\link[=funs]{funs()}}, or a
+character vector of function names, or simply a function.
-\item{.funs}{List of function calls generated by
-\code{\link{funs}()}, or a character vector of function names, or
-simply a function (only for local sources).}
+Bare formulas are passed to \code{\link[rlang:as_function]{rlang::as_function()}} to create
+purrr-style lambda functions. Note that these lambda prevent
+hybrid evaluation from happening and it is thus more efficient to
+supply functions like \code{mean()} directly rather than in a
+lambda-formula.}
-\item{...}{Additional arguments for the function calls. These are
-evaluated only once.}
+\item{...}{Additional arguments for the function calls in
+\code{.funs}. These are evaluated only once, with \link[rlang:dots_list]{explicit
+splicing}.}
\item{.predicate}{A predicate function to be applied to the columns
-or a logical vector. The columns for which \code{.predicate} is
-or returns \code{TRUE} will be summarised or mutated.}
+or a logical vector. The variables for which \code{.predicate} is or
+returns \code{TRUE} are selected. This argument is passed to
+\code{\link[rlang:as_function]{rlang::as_function()}} and thus supports quosure-style lambda
+functions and strings representing function names.}
-\item{.cols}{A list of columns generated by \code{\link{vars}()},
+\item{.vars}{A list of columns generated by \code{\link[=vars]{vars()}},
or a character vector of column names, or a numeric vector of column
positions.}
+
+\item{.cols}{This argument has been renamed to \code{.vars} to fit
+dplyr's terminology and is deprecated.}
}
\value{
A data frame. By default, the newly created columns have the shortest
- names needed to distinguish the output. To force inclusion of a name,
- even when not needed, name the input (see examples for details).
+names needed to uniquely identify the output. To force inclusion of a name,
+even when not needed, name the input (see examples for details).
}
\description{
-\code{summarise_all()} and \code{mutate_all()} apply the functions
-to all (non-grouping) columns. \code{summarise_at()} and
-\code{mutate_at()} allow you to select columns
-using the same name-based \code{\link{select_helpers}} as with
-\code{\link{select}()}. \code{summarise_if}() and
-\code{mutate_if}() operate on columns for which a predicate returns
-\code{TRUE}. Finally, \code{\link{summarise_each}()} and
-\code{\link{mutate_each}()} are older variants that will be
-deprecated in the future.
+These verbs are \link{scoped} variants of \code{\link[=summarise]{summarise()}}, \code{\link[=mutate]{mutate()}} and
+\code{\link[=transmute]{transmute()}}. They apply operations on a selection of variables.
+\itemize{
+\item \code{summarise_all()}, \code{mutate_all()} and \code{transmute_all()} apply the
+functions to all (non-grouping) columns.
+\item \code{summarise_at()}, \code{mutate_at()} and \code{transmute_at()} allow you to
+select columns using the same name-based \link{select_helpers} just
+like with \code{\link[=select]{select()}}.
+\item \code{summarise_if}(), \code{mutate_if}() and \code{transmute_if()} operate on
+columns for which a predicate returns \code{TRUE}.
+}
}
\examples{
-by_species <- iris \%>\% group_by(Species)
+# The scoped variants of summarise() and mutate() make it easy to
+# apply the same transformation to multiple variables:
-# One function
-by_species \%>\% summarise_all(n_distinct)
-by_species \%>\% summarise_all(mean)
+iris \%>\%
+ group_by(Species) \%>\%
+ summarise_all(mean)
-# Use the _at and _if variants for conditional mapping.
-by_species \%>\% summarise_if(is.numeric, mean)
+# There are three variants.
+# * _all affects every variable
+# * _at affects variables selected with a character vector or vars()
+# * _if affects variables selected with a predicate function:
-# summarise_at() can use select() helpers with the vars() function:
-by_species \%>\% summarise_at(vars(Petal.Width), mean)
-by_species \%>\% summarise_at(vars(matches("Width")), mean)
+starwars \%>\% summarise_at(vars(height:mass), mean, na.rm = TRUE)
+starwars \%>\% summarise_at(c("height", "mass"), mean, na.rm = TRUE)
+starwars \%>\% summarise_if(is.numeric, mean, na.rm = TRUE)
-# You can also specify columns with column names or column positions:
-by_species \%>\% summarise_at(c("Sepal.Width", "Petal.Width"), mean)
-by_species \%>\% summarise_at(c(1, 3), mean)
+# mutate_if is particularly useful for transforming variables from
+# one type to another
+iris \%>\% as_tibble() \%>\% mutate_if(is.factor, as.character)
+iris \%>\% as_tibble() \%>\% mutate_if(is.double, as.integer)
-# You can provide additional arguments. Those are evaluated only once:
-by_species \%>\% summarise_all(mean, trim = 1)
-by_species \%>\% summarise_at(vars(Petal.Width), mean, trim = 1)
+# ---------------------------------------------------------------------------
+# If you want apply multiple transformations, use funs()
+by_species <- iris \%>\% group_by(Species)
-# You can provide an expression or multiple functions with the funs() helper.
-by_species \%>\% mutate_all(funs(. * 0.4))
by_species \%>\% summarise_all(funs(min, max))
-# Note that output variable name must now include function name, in order to
+# Note that output variable name now includes the function name, in order to
# keep things distinct.
-# Function names will be included if .funs has names or whenever multiple
-# functions are used.
-by_species \%>\% mutate_all(funs("in" = . / 2.54))
-by_species \%>\% mutate_all(funs(rg = diff(range(.))))
+# You can express more complex inline transformations using .
+by_species \%>\% mutate_all(funs(. / 2.54))
+
+# Function names will be included if .funs has names or multiple inputs
+by_species \%>\% mutate_all(funs(cm = . / 2.54))
by_species \%>\% summarise_all(funs(med = median))
by_species \%>\% summarise_all(funs(Q3 = quantile), probs = 0.75)
by_species \%>\% summarise_all(c("min", "max"))
-
-# Two functions, continued
-by_species \%>\% summarise_at(vars(Petal.Width, Sepal.Width), funs(min, max))
-by_species \%>\% summarise_at(vars(matches("Width")), funs(min, max))
-
}
\seealso{
-\code{\link{vars}()}, \code{\link{funs}()}
+\code{\link[=vars]{vars()}}, \code{\link[=funs]{funs()}}
}
-
diff --git a/man/summarise_each.Rd b/man/summarise_each.Rd
index 9b8fb29..8c136cb 100644
--- a/man/summarise_each.Rd
+++ b/man/summarise_each.Rd
@@ -1,10 +1,10 @@
% Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/colwise.R
+% Please edit documentation in R/colwise-mutate.R
\name{summarise_each}
-\alias{mutate_each}
-\alias{mutate_each_}
\alias{summarise_each}
\alias{summarise_each_}
+\alias{mutate_each}
+\alias{mutate_each_}
\alias{summarize_each}
\alias{summarize_each_}
\title{Summarise and mutate multiple columns.}
@@ -13,36 +13,27 @@ summarise_each(tbl, funs, ...)
summarise_each_(tbl, funs, vars)
-summarize_each(tbl, funs, ...)
-
-summarize_each_(tbl, funs, vars)
-
mutate_each(tbl, funs, ...)
mutate_each_(tbl, funs, vars)
-}
-\arguments{
-\item{tbl}{a tbl}
-\item{funs}{List of function calls, generated by \code{\link{funs}}, or
-a character vector of function names.}
-
-\item{vars, ...}{Variables to include/exclude in mutate/summarise.
- You can use same specifications as in \code{\link{select}}. If missing,
- defaults to all non-grouping variables.
+summarize_each(tbl, funs, ...)
- For standard evaluation versions (ending in \code{_}) these can
- be either a list of expressions or a character vector.}
+summarize_each_(tbl, funs, vars)
}
\description{
-Apply one or more functions to one or more columns. Grouping variables
-are always excluded from modification.
-}
-\details{
-In the future \code{mutate_each()} and \code{summarise_each()} will
-be deprecated in favour of a more featureful family of functions:
-\code{\link{mutate_all}()}, \code{\link{mutate_at}()},
-\code{\link{mutate_if}()}, \code{\link{summarise_all}()},
-\code{\link{summarise_at}()} and \code{\link{summarise_if}()}.
+\code{mutate_each()} and \code{summarise_each()} are deprecated in favour of
+a more featureful family of functions: \code{\link[=mutate_all]{mutate_all()}},
+\code{\link[=mutate_at]{mutate_at()}}, \code{\link[=mutate_if]{mutate_if()}}, \code{\link[=summarise_all]{summarise_all()}}, \code{\link[=summarise_at]{summarise_at()}}
+and \code{\link[=summarise_if]{summarise_if()}}.
+
+The \code{_each()} functions have two replacements depending on what
+variables you want to apply \code{funs} to. To apply a function to all
+variables, use \code{\link[=mutate_all]{mutate_all()}} or \code{\link[=summarise_all]{summarise_all()}}. To apply a
+function to a selection of variables, use \code{\link[=mutate_at]{mutate_at()}} or
+\code{\link[=summarise_at]{summarise_at()}}.
+
+See the relevant section of \code{vignette("compatibility")} for more
+information.
}
-
+\keyword{internal}
diff --git a/man/tally.Rd b/man/tally.Rd
index 4434e89..c0c918e 100644
--- a/man/tally.Rd
+++ b/man/tally.Rd
@@ -1,52 +1,82 @@
% Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/tally.R
+% Please edit documentation in R/count-tally.R
\name{tally}
-\alias{count}
-\alias{count_}
\alias{tally}
-\title{Counts/tally observations by group.}
+\alias{count}
+\alias{add_tally}
+\alias{add_count}
+\title{Count/tally observations by group}
\usage{
tally(x, wt, sort = FALSE)
count(x, ..., wt = NULL, sort = FALSE)
-count_(x, vars, wt = NULL, sort = FALSE)
+add_tally(x, wt, sort = FALSE)
+
+add_count(x, ..., wt = NULL, sort = FALSE)
}
\arguments{
-\item{x}{a \code{\link{tbl}} to tally/count.}
+\item{x}{a \code{\link[=tbl]{tbl()}} to tally/count.}
-\item{wt}{(Optional) If omitted, will count the number of rows. If specified,
-will perform a "weighted" tally by summing the (non-missing) values of
-variable \code{wt}.}
+\item{wt}{(Optional) If omitted, will count the number of rows. If
+specified, will perform a "weighted" tally by summing the
+(non-missing) values of variable \code{wt}. This argument is
+automatically \link[rlang:quo]{quoted} and later
+\link[rlang:eval_tidy]{evaluated} in the context of the data
+frame. It supports \link[rlang:quasiquotation]{unquoting}. See
+\code{vignette("programming")} for an introduction to these concepts.}
\item{sort}{if \code{TRUE} will sort output in descending order of \code{n}}
-\item{..., vars}{Variables to group by.}
+\item{...}{Variables to group by.}
}
-\description{
-\code{tally} is a convenient wrapper for summarise that will either call
-\code{\link{n}} or \code{\link{sum}(n)} depending on whether you're tallying
-for the first time, or re-tallying. \code{count()} is similar, but also
-does the \code{\link{group_by}} for you.
+\value{
+A tbl, grouped the same way as \code{x}.
}
-\examples{
-if (require("Lahman")) {
-batting_tbl <- tbl_df(Batting)
-tally(group_by(batting_tbl, yearID))
-tally(group_by(batting_tbl, yearID), sort = TRUE)
-
-# Multiple tallys progressively roll up the groups
-plays_by_year <- tally(group_by(batting_tbl, playerID, stint), sort = TRUE)
-tally(plays_by_year, sort = TRUE)
-tally(tally(plays_by_year))
-
-# This looks a little nicer if you use the infix \%>\% operator
-batting_tbl \%>\% group_by(playerID) \%>\% tally(sort = TRUE)
-
-# count is even more succinct - it also does the grouping for you
-batting_tbl \%>\% count(playerID)
-batting_tbl \%>\% count(playerID, wt = G)
-batting_tbl \%>\% count(playerID, wt = G, sort = TRUE)
+\description{
+\code{tally()} is a convenient wrapper for summarise that will either call
+\code{\link[=n]{n()}} or \code{\link{sum}(n)} depending on whether you're tallying
+for the first time, or re-tallying. \code{count()} is similar but calls
+\code{\link[=group_by]{group_by()}} before and \code{\link[=ungroup]{ungroup()}} after.
+
+\code{add_tally()} adds a column "n" to a table based on the number
+of items within each existing group, while \code{add_count()} is a shortcut that
+does the grouping as well. These functions are to \code{\link[=tally]{tally()}}
+and \code{\link[=count]{count()}} as \code{\link[=mutate]{mutate()}} is to \code{\link[=summarise]{summarise()}}:
+they add an additional column rather than collapsing each group.
}
+\note{
+The column name in the returned data is usually \code{n}, even if you
+have supplied a weight.
+
+If the data already already has a column named \code{n}, the output column
+will be called \code{nn}. If the table already has columns called \code{n} and \code{nn}
+then the column returned will be \code{nnn}, and so on.
+
+There is currently no way to control the output variable name - if you
+need to change the default, you'll have to write the \code{\link[=summarise]{summarise()}}
+yourself.
}
+\examples{
+# tally() is short-hand for summarise()
+mtcars \%>\% tally()
+# count() is a short-hand for group_by() + tally()
+mtcars \%>\% count(cyl)
+# add_tally() is short-hand for mutate()
+mtcars \%>\% add_tally()
+# add_count() is a short-hand for group_by() + add_tally()
+mtcars \%>\% add_count(cyl)
+
+# count and tally are designed so that you can call
+# them repeatedly, each time rolling up a level of detail
+species <- starwars \%>\% count(species, homeworld, sort = TRUE)
+species
+species \%>\% count(species, sort = TRUE)
+
+# add_count() is useful for groupwise filtering
+# e.g.: show only species that have a single member
+starwars \%>\%
+ add_count(species) \%>\%
+ filter(n == 1)
+}
diff --git a/man/tbl.Rd b/man/tbl.Rd
index 7697185..e8ce60c 100644
--- a/man/tbl.Rd
+++ b/man/tbl.Rd
@@ -1,9 +1,9 @@
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/tbl.r
\name{tbl}
-\alias{as.tbl}
-\alias{is.tbl}
\alias{tbl}
+\alias{is.tbl}
+\alias{as.tbl}
\title{Create a table from a data source}
\usage{
tbl(src, ...)
@@ -22,4 +22,3 @@ as.tbl(x, ...)
\description{
This is a generic method that dispatches based on the first argument.
}
-
diff --git a/man/tbl_cube.Rd b/man/tbl_cube.Rd
index 852d080..10b4560 100644
--- a/man/tbl_cube.Rd
+++ b/man/tbl_cube.Rd
@@ -2,13 +2,13 @@
% Please edit documentation in R/tbl-cube.r
\name{tbl_cube}
\alias{tbl_cube}
-\title{A data cube tbl.}
+\title{A data cube tbl}
\usage{
tbl_cube(dimensions, measures)
}
\arguments{
\item{dimensions}{A named list of vectors. A dimension is a variable
-whose values are known before the experiement is conducted; they are
+whose values are known before the experiment is conducted; they are
fixed by design (in \pkg{reshape2} they are known as id variables).
\code{tbl_cubes} are dense which means that almost every combination of
the dimensions should have associated measurements: missing values require
@@ -43,24 +43,25 @@ extension to storing data frames for indices rather than vectors).
Manipulation functions:
\itemize{
- \item \code{select} (M)
+\item \code{select()} (M)
- \item \code{summarise} (M), corresponds to roll-up, but rather more
- limited since there are no hierarchies.
+\item \code{summarise()} (M), corresponds to roll-up, but rather more
+limited since there are no hierarchies.
- \item \code{filter} (D), corresponds to slice/dice.
+\item \code{filter()} (D), corresponds to slice/dice.
- \item \code{mutate} (M) is not implemented, but should be relatively
- straightforward given the implementation of \code{summarise}.
+\item \code{mutate()} (M) is not implemented, but should be relatively
+straightforward given the implementation of \code{summarise}.
- \item \code{arrange} (D?) Not implemented: not obvious how much sense
- it would make
+\item \code{arrange()} (D?) Not implemented: not obvious how much sense
+it would make
}
Joins: not implemented. See \code{vignettes/joins.graffle} for ideas.
Probably straightforward if you get the indexes right, and that's probably
some straightforward array/tensor operation.
}
+
\examples{
# The built in nasa dataset records meterological data (temperature,
# cloud cover, ozone etc) for a 4d spatio-temporal dataset (lat, long,
@@ -94,7 +95,6 @@ by_loc <- group_by(nasa, lat, long)
summarise(by_loc, pressure = max(pressure), temp = mean(temperature))
}
\seealso{
-\code{\link{as.tbl_cube}} for ways of coercing existing data
- structures into a \code{tbl_cube}.
+\code{\link[=as.tbl_cube]{as.tbl_cube()}} for ways of coercing existing data
+structures into a \code{tbl_cube}.
}
-
diff --git a/man/tbl_df.Rd b/man/tbl_df.Rd
index 875c967..8c8c69c 100644
--- a/man/tbl_df.Rd
+++ b/man/tbl_df.Rd
@@ -10,59 +10,6 @@ tbl_df(data)
\item{data}{a data frame}
}
\description{
-Forwards the argument to \code{\link[tibble]{as_data_frame}}, see
-\link{tibble-package} for more details.
+Deprecated: please use \code{\link[tibble:as_tibble]{tibble::as_tibble()}} instead.
}
-\examples{
-ds <- tbl_df(mtcars)
-ds
-as.data.frame(ds)
-
-if (require("Lahman") && packageVersion("Lahman") >= "3.0.1") {
-batting <- tbl_df(Batting)
-dim(batting)
-colnames(batting)
-head(batting)
-
-# Data manipulation verbs ---------------------------------------------------
-filter(batting, yearID > 2005, G > 130)
-select(batting, playerID:lgID)
-arrange(batting, playerID, desc(yearID))
-summarise(batting, G = mean(G), n = n())
-mutate(batting, rbi2 = if(is.null(AB)) 1.0 * R / AB else 0)
-
-# Group by operations -------------------------------------------------------
-# To perform operations by group, create a grouped object with group_by
-players <- group_by(batting, playerID)
-head(group_size(players), 100)
-
-summarise(players, mean_g = mean(G), best_ab = max(AB))
-best_year <- filter(players, AB == max(AB) | G == max(G))
-progress <- mutate(players, cyear = yearID - min(yearID) + 1,
- rank(desc(AB)), cumsum(AB))
-
-# When you group by multiple level, each summarise peels off one level
-\donttest{
-per_year <- group_by(batting, playerID, yearID)
-stints <- summarise(per_year, stints = max(stint))
-filter(stints, stints > 3)
-summarise(stints, max(stints))
-mutate(stints, cumsum(stints))
-}
-
-# Joins ---------------------------------------------------------------------
-player_info <- select(tbl_df(Master), playerID, birthYear)
-hof <- select(filter(tbl_df(HallOfFame), inducted == "Y"),
- playerID, votedBy, category)
-
-# Match players and their hall of fame data
-inner_join(player_info, hof)
-# Keep all players, match hof data where available
-left_join(player_info, hof)
-# Find only players in hof
-semi_join(player_info, hof)
-# Find players not in hof
-anti_join(player_info, hof)
-}
-}
-
+\keyword{internal}
diff --git a/man/tbl_sql.Rd b/man/tbl_sql.Rd
deleted file mode 100644
index 8cc8bde..0000000
--- a/man/tbl_sql.Rd
+++ /dev/null
@@ -1,25 +0,0 @@
-% Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/tbl-sql.r
-\name{tbl_sql}
-\alias{tbl_sql}
-\title{Create an SQL tbl (abstract)}
-\usage{
-tbl_sql(subclass, src, from, ..., vars = attr(from, "vars"))
-}
-\arguments{
-\item{subclass}{name of subclass}
-
-\item{...}{needed for agreement with generic. Not otherwise used.}
-
-\item{vars}{If known, the names of the variables in the tbl. This is
-relatively expensive to determine automatically, so is cached throughout
-dplyr. However, you should usually be able to leave this blank and it
-will be determined from the context.}
-}
-\description{
-This method shouldn't be called by users - it should only be used by
-backend implementors who are creating backends that extend the basic
-sql behaviour.
-}
-\keyword{internal}
-
diff --git a/man/tbl_vars.Rd b/man/tbl_vars.Rd
index c1aa276..7f70ef2 100644
--- a/man/tbl_vars.Rd
+++ b/man/tbl_vars.Rd
@@ -2,14 +2,22 @@
% Please edit documentation in R/tbl.r
\name{tbl_vars}
\alias{tbl_vars}
+\alias{tbl_nongroup_vars}
\title{List variables provided by a tbl.}
\usage{
tbl_vars(x)
+
+tbl_nongroup_vars(x)
}
\arguments{
\item{x}{A tbl object}
}
\description{
-List variables provided by a tbl.
+\code{tbl_vars()} returns all variables while \code{tbl_nongroup_vars()}
+returns only non-grouping variables.
}
-
+\seealso{
+\code{\link[=group_vars]{group_vars()}} for a function that returns grouping
+variables.
+}
+\keyword{internal}
diff --git a/man/testing.Rd b/man/testing.Rd
deleted file mode 100644
index cd6bbe4..0000000
--- a/man/testing.Rd
+++ /dev/null
@@ -1,32 +0,0 @@
-% Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/data-temp.r
-\name{testing}
-\alias{test_frame}
-\alias{test_load}
-\alias{test_register_src}
-\alias{testing}
-\title{Infrastructure for testing dplyr}
-\usage{
-test_register_src(name, src)
-
-test_load(df, name = random_table_name(), srcs = test_srcs$get(),
- ignore = character())
-
-test_frame(..., srcs = test_srcs$get(), ignore = character())
-}
-\description{
-Register testing sources, then use \code{test_load} to load an existing
-data frame into each source. To create a new table in each source,
-use \code{test_frame}.
-}
-\examples{
-\dontrun{
-test_register_src("df", src_df(env = new.env()))
-test_register_src("sqlite", src_sqlite(":memory:", create = TRUE))
-
-test_frame(x = 1:3, y = 3:1)
-test_load(mtcars)
-}
-}
-\keyword{internal}
-
diff --git a/man/top_n.Rd b/man/top_n.Rd
index 6daa5d1..eb21f40 100644
--- a/man/top_n.Rd
+++ b/man/top_n.Rd
@@ -2,26 +2,31 @@
% Please edit documentation in R/top-n.R
\name{top_n}
\alias{top_n}
-\title{Select top (or bottom) n rows (by value).}
+\title{Select top (or bottom) n rows (by value)}
\usage{
top_n(x, n, wt)
}
\arguments{
-\item{x}{a \code{\link{tbl}} to filter}
+\item{x}{a \code{\link[=tbl]{tbl()}} to filter}
\item{n}{number of rows to return. If \code{x} is grouped, this is the
- number of rows per group. Will include more than \code{n} rows if
- there are ties.
+number of rows per group. Will include more than \code{n} rows if
+there are ties.
- If \code{n} is positive, selects the top \code{n} rows. If negative,
- selects the bottom \code{n} rows.}
+If \code{n} is positive, selects the top \code{n} rows. If negative,
+selects the bottom \code{n} rows.}
-\item{wt}{(Optional). The variable to use for ordering. If not specified,
-defaults to the last variable in the tbl.}
+\item{wt}{(Optional). The variable to use for ordering. If not
+specified, defaults to the last variable in the tbl.
+
+This argument is automatically \link[rlang:quo]{quoted} and later
+\link[rlang:eval_tidy]{evaluated} in the context of the data
+frame. It supports \link[rlang:quasiquotation]{unquoting}. See
+\code{vignette("programming")} for an introduction to these concepts.}
}
\description{
-This is a convenient wrapper that uses \code{\link{filter}} and
-\code{\link{min_rank}} to select the top or bottom entries in each group,
+This is a convenient wrapper that uses \code{\link[=filter]{filter()}} and
+\code{\link[=min_rank]{min_rank()}} to select the top or bottom entries in each group,
ordered by \code{wt}.
}
\examples{
@@ -45,4 +50,3 @@ tbl_df(Batting) \%>\%
tbl_df(Batting) \%>\% group_by(playerID) \%>\% top_n(1, G)
}
}
-
diff --git a/man/translate_sql.Rd b/man/translate_sql.Rd
deleted file mode 100644
index 126019b..0000000
--- a/man/translate_sql.Rd
+++ /dev/null
@@ -1,128 +0,0 @@
-% Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/translate-sql.r
-\name{translate_sql}
-\alias{translate_sql}
-\alias{translate_sql_}
-\title{Translate an expression to sql.}
-\usage{
-translate_sql(..., con = NULL, vars = character(), vars_group = NULL,
- vars_order = NULL, window = TRUE)
-
-translate_sql_(dots, con = NULL, vars = character(), vars_group = NULL,
- vars_order = NULL, window = TRUE)
-}
-\arguments{
-\item{..., dots}{Expressions to translate. \code{sql_translate}
-automatically quotes them for you. \code{sql_translate_} expects
-a list of already quoted objects.}
-
-\item{con}{An optional database connection to control the details of
-the translation. The default, \code{NULL}, generates ANSI SQL.}
-
-\item{vars}{A character vector giving variable names in the remote
-data source. If this is supplied, \code{translate_sql} will call
-\code{\link{partial_eval}} to interpolate in the values from local
-variables.}
-
-\item{vars_group, vars_order}{Grouping and ordering variables used for
-windowed functions.}
-
-\item{window}{Use \code{FALSE} to suppress generation of the \code{OVER}
-statement used for window functions. This is necessary when generating
-SQL for a grouped summary.}
-}
-\description{
-Translate an expression to sql.
-}
-\section{Base translation}{
-
-The base translator, \code{base_sql},
-provides custom mappings for \code{!} (to NOT), \code{&&} and \code{&} to
-\code{AND}, \code{||} and \code{|} to \code{OR}, \code{^} to \code{POWER},
-\code{\%>\%} to \code{\%}, \code{ceiling} to \code{CEIL}, \code{mean} to
-\code{AVG}, \code{var} to \code{VARIANCE}, \code{tolower} to \code{LOWER},
-\code{toupper} to \code{UPPER} and \code{nchar} to \code{length}.
-
-\code{c} and \code{:} keep their usual R behaviour so you can easily create
-vectors that are passed to sql.
-
-All other functions will be preserved as is. R's infix functions
-(e.g. \code{\%like\%}) will be converted to their sql equivalents
-(e.g. \code{LIKE}). You can use this to access SQL string concatenation:
-\code{||} is mapped to \code{OR}, but \code{\%||\%} is mapped to \code{||}.
-To suppress this behaviour, and force errors immediately when dplyr doesn't
-know how to translate a function it encounters, using set the
-\code{dplyr.strict_sql} option to \code{TRUE}.
-
-You can also use \code{sql} to insert a raw sql string.
-}
-
-\section{SQLite translation}{
-
-The SQLite variant currently only adds one additional function: a mapping
-from \code{sd} to the SQL aggregation function \code{stdev}.
-}
-\examples{
-# Regular maths is translated in a very straightforward way
-translate_sql(x + 1)
-translate_sql(sin(x) + tan(y))
-
-# Note that all variable names are escaped
-translate_sql(like == "x")
-# In ANSI SQL: "" quotes variable _names_, '' quotes strings
-
-# Logical operators are converted to their sql equivalents
-translate_sql(x < 5 & !(y >= 5))
-# xor() doesn't have a direct SQL equivalent
-translate_sql(xor(x, y))
-
-# If is translated into case when
-translate_sql(if (x > 5) "big" else "small")
-
-# Infix functions are passed onto SQL with \% removed
-translate_sql(first \%like\% "Had*")
-translate_sql(first \%is\% NULL)
-translate_sql(first \%in\% c("John", "Roger", "Robert"))
-
-
-# And be careful if you really want integers
-translate_sql(x == 1)
-translate_sql(x == 1L)
-
-# If you have an already quoted object, use translate_sql_:
-x <- quote(y + 1 / sin(t))
-translate_sql_(list(x))
-
-# Translation with known variables ------------------------------------------
-
-# If the variables in the dataset are known, translate_sql will interpolate
-# in literal values from the current environment
-x <- 10
-translate_sql(mpg > x)
-translate_sql(mpg > x, vars = names(mtcars))
-
-# By default all computations happens in sql
-translate_sql(cyl == 2 + 2, vars = names(mtcars))
-# Use local to force local evaluation
-translate_sql(cyl == local(2 + 2), vars = names(mtcars))
-
-# This is also needed if you call a local function:
-inc <- function(x) x + 1
-translate_sql(mpg > inc(x), vars = names(mtcars))
-translate_sql(mpg > local(inc(x)), vars = names(mtcars))
-
-# Windowed translation --------------------------------------------
-# Known window functions automatically get OVER()
-translate_sql(mpg > mean(mpg))
-
-# Suppress this with window = FALSE
-translate_sql(mpg > mean(mpg), window = FALSE)
-
-# vars_group controls partition:
-translate_sql(mpg > mean(mpg), vars_group = "cyl")
-
-# and vars_order controls ordering for those functions that need it
-translate_sql(cumsum(mpg))
-translate_sql(cumsum(mpg), vars_order = "mpg")
-}
-
diff --git a/man/vars.Rd b/man/vars.Rd
index bc1d149..2763c54 100644
--- a/man/vars.Rd
+++ b/man/vars.Rd
@@ -2,21 +2,31 @@
% Please edit documentation in R/colwise.R
\name{vars}
\alias{vars}
-\title{Select columns}
+\title{Select variables}
\usage{
vars(...)
}
\arguments{
\item{...}{Variables to include/exclude in mutate/summarise. You
-can use same specifications as in \code{\link{select}}. If
-missing, defaults to all non-grouping variables.}
+can use same specifications as in \code{\link[=select]{select()}}. If missing,
+defaults to all non-grouping variables.
+
+These arguments are automatically \link[rlang:quo]{quoted} and later
+\link[rlang:eval_tidy]{evaluated} in the context of the data
+frame. They support \link[rlang:quasiquotation]{unquoting}. See
+\code{vignette("programming")} for an introduction to these concepts.}
}
\description{
-This helper has equivalent semantics to \code{\link{select}()}. Its
-purpose is to provide \code{select()} semantics to the colwise
-summarising and mutating verbs.
+This helper is intended to provide equivalent semantics to
+\code{\link[=select]{select()}}. It is used for instance in scoped summarising and
+mutating verbs (\code{\link[=mutate_at]{mutate_at()}} and \code{\link[=summarise_at]{summarise_at()}}).
+}
+\details{
+Note that verbs accepting a \code{vars()} specification also accept an
+\link[rlang:is_integerish]{integerish} vector of positions or a
+character vector of column names.
}
\seealso{
-\code{\link{summarise_all}()}
+\code{\link[=funs]{funs()}}, \code{\link[=all_vars]{all_vars()}} and \code{\link[=any_vars]{any_vars()}} for other quoting
+functions that you can use with scoped verbs.
}
-
diff --git a/man/with_order.Rd b/man/with_order.Rd
index 2f6d908..bed9347 100644
--- a/man/with_order.Rd
+++ b/man/with_order.Rd
@@ -17,4 +17,3 @@ with_order(order_by, fun, x, ...)
This is used to power the ordering parameters of dplyr's window functions
}
\keyword{internal}
-
diff --git a/src/Makevars b/src/Makevars
index ace606e..9ba38da 100644
--- a/src/Makevars
+++ b/src/Makevars
@@ -1,4 +1,2 @@
-PKG_CPPFLAGS = -I../inst/include -DCOMPILING_DPLYR
-
# Disable long types from C99 or CPP11 extensions
-PKG_CXXFLAGS = -DBOOST_NO_INT64_T -DBOOST_NO_INTEGRAL_INT64_T -DBOOST_NO_LONG_LONG
+PKG_CPPFLAGS = -I../inst/include -DCOMPILING_DPLYR -DBOOST_NO_INT64_T -DBOOST_NO_INTEGRAL_INT64_T -DBOOST_NO_LONG_LONG -DRCPP_USING_UTF8_ERROR_STRING
diff --git a/src/Makevars.win b/src/Makevars.win
index da93f7e..8b38b6c 100644
--- a/src/Makevars.win
+++ b/src/Makevars.win
@@ -1 +1 @@
-PKG_CPPFLAGS = -I../inst/include -DCOMPILING_DPLYR
+PKG_CPPFLAGS = -I../inst/include -DCOMPILING_DPLYR -DRCPP_USING_UTF8_ERROR_STRING
diff --git a/src/RcppExports.cpp b/src/RcppExports.cpp
index 8634498..67c5c76 100644
--- a/src/RcppExports.cpp
+++ b/src/RcppExports.cpp
@@ -1,628 +1,738 @@
-// This file was generated by Rcpp::compileAttributes
+// Generated by using Rcpp::compileAttributes() -> do not edit by hand
// Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393
#include "../inst/include/dplyr.h"
+#include "../inst/include/dplyr_types.h"
#include <Rcpp.h>
+#include <string>
+#include <set>
using namespace Rcpp;
// loc
CharacterVector loc(RObject data);
-RcppExport SEXP dplyr_loc(SEXP dataSEXP) {
+RcppExport SEXP _dplyr_loc(SEXP dataSEXP) {
BEGIN_RCPP
- Rcpp::RObject __result;
- Rcpp::RNGScope __rngScope;
+ Rcpp::RObject rcpp_result_gen;
+ Rcpp::RNGScope rcpp_rngScope_gen;
Rcpp::traits::input_parameter< RObject >::type data(dataSEXP);
- __result = Rcpp::wrap(loc(data));
- return __result;
+ rcpp_result_gen = Rcpp::wrap(loc(data));
+ return rcpp_result_gen;
END_RCPP
}
// dfloc
CharacterVector dfloc(List df);
-RcppExport SEXP dplyr_dfloc(SEXP dfSEXP) {
+RcppExport SEXP _dplyr_dfloc(SEXP dfSEXP) {
BEGIN_RCPP
- Rcpp::RObject __result;
- Rcpp::RNGScope __rngScope;
+ Rcpp::RObject rcpp_result_gen;
+ Rcpp::RNGScope rcpp_rngScope_gen;
Rcpp::traits::input_parameter< List >::type df(dfSEXP);
- __result = Rcpp::wrap(dfloc(df));
- return __result;
+ rcpp_result_gen = Rcpp::wrap(dfloc(df));
+ return rcpp_result_gen;
END_RCPP
}
// plfloc
CharacterVector plfloc(Pairlist data);
-RcppExport SEXP dplyr_plfloc(SEXP dataSEXP) {
+RcppExport SEXP _dplyr_plfloc(SEXP dataSEXP) {
BEGIN_RCPP
- Rcpp::RObject __result;
- Rcpp::RNGScope __rngScope;
+ Rcpp::RObject rcpp_result_gen;
+ Rcpp::RNGScope rcpp_rngScope_gen;
Rcpp::traits::input_parameter< Pairlist >::type data(dataSEXP);
- __result = Rcpp::wrap(plfloc(data));
- return __result;
+ rcpp_result_gen = Rcpp::wrap(plfloc(data));
+ return rcpp_result_gen;
END_RCPP
}
-// rank_strings
-IntegerVector rank_strings(CharacterVector s);
-RcppExport SEXP dplyr_rank_strings(SEXP sSEXP) {
+// strings_addresses
+CharacterVector strings_addresses(CharacterVector s);
+RcppExport SEXP _dplyr_strings_addresses(SEXP sSEXP) {
BEGIN_RCPP
- Rcpp::RObject __result;
- Rcpp::RNGScope __rngScope;
+ Rcpp::RObject rcpp_result_gen;
+ Rcpp::RNGScope rcpp_rngScope_gen;
Rcpp::traits::input_parameter< CharacterVector >::type s(sSEXP);
- __result = Rcpp::wrap(rank_strings(s));
- return __result;
+ rcpp_result_gen = Rcpp::wrap(strings_addresses(s));
+ return rcpp_result_gen;
+END_RCPP
+}
+// gp
+unsigned short gp(SEXP x);
+RcppExport SEXP _dplyr_gp(SEXP xSEXP) {
+BEGIN_RCPP
+ Rcpp::RObject rcpp_result_gen;
+ Rcpp::RNGScope rcpp_rngScope_gen;
+ Rcpp::traits::input_parameter< SEXP >::type x(xSEXP);
+ rcpp_result_gen = Rcpp::wrap(gp(x));
+ return rcpp_result_gen;
+END_RCPP
+}
+// init_logging
+void init_logging(const std::string& log_level);
+RcppExport SEXP _dplyr_init_logging(SEXP log_levelSEXP) {
+BEGIN_RCPP
+ Rcpp::RNGScope rcpp_rngScope_gen;
+ Rcpp::traits::input_parameter< const std::string& >::type log_level(log_levelSEXP);
+ init_logging(log_level);
+ return R_NilValue;
END_RCPP
}
// arrange_impl
-List arrange_impl(DataFrame data, LazyDots dots);
-RcppExport SEXP dplyr_arrange_impl(SEXP dataSEXP, SEXP dotsSEXP) {
+List arrange_impl(DataFrame data, QuosureList quosures);
+RcppExport SEXP _dplyr_arrange_impl(SEXP dataSEXP, SEXP quosuresSEXP) {
BEGIN_RCPP
- Rcpp::RObject __result;
- Rcpp::RNGScope __rngScope;
+ Rcpp::RObject rcpp_result_gen;
+ Rcpp::RNGScope rcpp_rngScope_gen;
Rcpp::traits::input_parameter< DataFrame >::type data(dataSEXP);
- Rcpp::traits::input_parameter< LazyDots >::type dots(dotsSEXP);
- __result = Rcpp::wrap(arrange_impl(data, dots));
- return __result;
+ Rcpp::traits::input_parameter< QuosureList >::type quosures(quosuresSEXP);
+ rcpp_result_gen = Rcpp::wrap(arrange_impl(data, quosures));
+ return rcpp_result_gen;
END_RCPP
}
// between
LogicalVector between(NumericVector x, double left, double right);
-RcppExport SEXP dplyr_between(SEXP xSEXP, SEXP leftSEXP, SEXP rightSEXP) {
+RcppExport SEXP _dplyr_between(SEXP xSEXP, SEXP leftSEXP, SEXP rightSEXP) {
BEGIN_RCPP
- Rcpp::RObject __result;
- Rcpp::RNGScope __rngScope;
+ Rcpp::RObject rcpp_result_gen;
+ Rcpp::RNGScope rcpp_rngScope_gen;
Rcpp::traits::input_parameter< NumericVector >::type x(xSEXP);
Rcpp::traits::input_parameter< double >::type left(leftSEXP);
Rcpp::traits::input_parameter< double >::type right(rightSEXP);
- __result = Rcpp::wrap(between(x, left, right));
- return __result;
+ rcpp_result_gen = Rcpp::wrap(between(x, left, right));
+ return rcpp_result_gen;
+END_RCPP
+}
+// flatten_bindable
+SEXP flatten_bindable(SEXP x);
+RcppExport SEXP _dplyr_flatten_bindable(SEXP xSEXP) {
+BEGIN_RCPP
+ Rcpp::RObject rcpp_result_gen;
+ Rcpp::RNGScope rcpp_rngScope_gen;
+ Rcpp::traits::input_parameter< SEXP >::type x(xSEXP);
+ rcpp_result_gen = Rcpp::wrap(flatten_bindable(x));
+ return rcpp_result_gen;
END_RCPP
}
// bind_rows_
List bind_rows_(List dots, SEXP id);
-RcppExport SEXP dplyr_bind_rows_(SEXP dotsSEXP, SEXP idSEXP) {
+RcppExport SEXP _dplyr_bind_rows_(SEXP dotsSEXP, SEXP idSEXP) {
BEGIN_RCPP
- Rcpp::RObject __result;
- Rcpp::RNGScope __rngScope;
+ Rcpp::RObject rcpp_result_gen;
+ Rcpp::RNGScope rcpp_rngScope_gen;
Rcpp::traits::input_parameter< List >::type dots(dotsSEXP);
Rcpp::traits::input_parameter< SEXP >::type id(idSEXP);
- __result = Rcpp::wrap(bind_rows_(dots, id));
- return __result;
-END_RCPP
-}
-// rbind_list__impl
-List rbind_list__impl(Dots dots);
-RcppExport SEXP dplyr_rbind_list__impl(SEXP dotsSEXP) {
-BEGIN_RCPP
- Rcpp::RObject __result;
- Rcpp::RNGScope __rngScope;
- Rcpp::traits::input_parameter< Dots >::type dots(dotsSEXP);
- __result = Rcpp::wrap(rbind_list__impl(dots));
- return __result;
+ rcpp_result_gen = Rcpp::wrap(bind_rows_(dots, id));
+ return rcpp_result_gen;
END_RCPP
}
// cbind_all
List cbind_all(List dots);
-RcppExport SEXP dplyr_cbind_all(SEXP dotsSEXP) {
+RcppExport SEXP _dplyr_cbind_all(SEXP dotsSEXP) {
BEGIN_RCPP
- Rcpp::RObject __result;
- Rcpp::RNGScope __rngScope;
+ Rcpp::RObject rcpp_result_gen;
+ Rcpp::RNGScope rcpp_rngScope_gen;
Rcpp::traits::input_parameter< List >::type dots(dotsSEXP);
- __result = Rcpp::wrap(cbind_all(dots));
- return __result;
+ rcpp_result_gen = Rcpp::wrap(cbind_all(dots));
+ return rcpp_result_gen;
END_RCPP
}
// combine_all
SEXP combine_all(List data);
-RcppExport SEXP dplyr_combine_all(SEXP dataSEXP) {
+RcppExport SEXP _dplyr_combine_all(SEXP dataSEXP) {
BEGIN_RCPP
- Rcpp::RObject __result;
- Rcpp::RNGScope __rngScope;
+ Rcpp::RObject rcpp_result_gen;
+ Rcpp::RNGScope rcpp_rngScope_gen;
Rcpp::traits::input_parameter< List >::type data(dataSEXP);
- __result = Rcpp::wrap(combine_all(data));
- return __result;
+ rcpp_result_gen = Rcpp::wrap(combine_all(data));
+ return rcpp_result_gen;
END_RCPP
}
// combine_vars
SEXP combine_vars(CharacterVector vars, ListOf<IntegerVector> xs);
-RcppExport SEXP dplyr_combine_vars(SEXP varsSEXP, SEXP xsSEXP) {
+RcppExport SEXP _dplyr_combine_vars(SEXP varsSEXP, SEXP xsSEXP) {
BEGIN_RCPP
- Rcpp::RObject __result;
- Rcpp::RNGScope __rngScope;
+ Rcpp::RObject rcpp_result_gen;
+ Rcpp::RNGScope rcpp_rngScope_gen;
Rcpp::traits::input_parameter< CharacterVector >::type vars(varsSEXP);
Rcpp::traits::input_parameter< ListOf<IntegerVector> >::type xs(xsSEXP);
- __result = Rcpp::wrap(combine_vars(vars, xs));
- return __result;
+ rcpp_result_gen = Rcpp::wrap(combine_vars(vars, xs));
+ return rcpp_result_gen;
END_RCPP
}
// distinct_impl
-SEXP distinct_impl(DataFrame df, CharacterVector vars, CharacterVector keep);
-RcppExport SEXP dplyr_distinct_impl(SEXP dfSEXP, SEXP varsSEXP, SEXP keepSEXP) {
+SEXP distinct_impl(DataFrame df, const SymbolVector& vars, const SymbolVector& keep);
+RcppExport SEXP _dplyr_distinct_impl(SEXP dfSEXP, SEXP varsSEXP, SEXP keepSEXP) {
BEGIN_RCPP
- Rcpp::RObject __result;
- Rcpp::RNGScope __rngScope;
+ Rcpp::RObject rcpp_result_gen;
+ Rcpp::RNGScope rcpp_rngScope_gen;
Rcpp::traits::input_parameter< DataFrame >::type df(dfSEXP);
- Rcpp::traits::input_parameter< CharacterVector >::type vars(varsSEXP);
- Rcpp::traits::input_parameter< CharacterVector >::type keep(keepSEXP);
- __result = Rcpp::wrap(distinct_impl(df, vars, keep));
- return __result;
+ Rcpp::traits::input_parameter< const SymbolVector& >::type vars(varsSEXP);
+ Rcpp::traits::input_parameter< const SymbolVector& >::type keep(keepSEXP);
+ rcpp_result_gen = Rcpp::wrap(distinct_impl(df, vars, keep));
+ return rcpp_result_gen;
END_RCPP
}
-// assert_all_white_list
-void assert_all_white_list(const DataFrame& data);
-RcppExport SEXP dplyr_assert_all_white_list(SEXP dataSEXP) {
+// n_distinct_multi
+SEXP n_distinct_multi(List variables, bool na_rm);
+RcppExport SEXP _dplyr_n_distinct_multi(SEXP variablesSEXP, SEXP na_rmSEXP) {
BEGIN_RCPP
- Rcpp::RNGScope __rngScope;
- Rcpp::traits::input_parameter< const DataFrame& >::type data(dataSEXP);
- assert_all_white_list(data);
- return R_NilValue;
+ Rcpp::RObject rcpp_result_gen;
+ Rcpp::RNGScope rcpp_rngScope_gen;
+ Rcpp::traits::input_parameter< List >::type variables(variablesSEXP);
+ Rcpp::traits::input_parameter< bool >::type na_rm(na_rmSEXP);
+ rcpp_result_gen = Rcpp::wrap(n_distinct_multi(variables, na_rm));
+ return rcpp_result_gen;
+END_RCPP
+}
+// filter_impl
+SEXP filter_impl(DataFrame df, NamedQuosure quo);
+RcppExport SEXP _dplyr_filter_impl(SEXP dfSEXP, SEXP quoSEXP) {
+BEGIN_RCPP
+ Rcpp::RObject rcpp_result_gen;
+ Rcpp::RNGScope rcpp_rngScope_gen;
+ Rcpp::traits::input_parameter< DataFrame >::type df(dfSEXP);
+ Rcpp::traits::input_parameter< NamedQuosure >::type quo(quoSEXP);
+ rcpp_result_gen = Rcpp::wrap(filter_impl(df, quo));
+ return rcpp_result_gen;
END_RCPP
}
+// grouped_df_impl
+DataFrame grouped_df_impl(DataFrame data, SymbolVector symbols, bool drop);
+RcppExport SEXP _dplyr_grouped_df_impl(SEXP dataSEXP, SEXP symbolsSEXP, SEXP dropSEXP) {
+BEGIN_RCPP
+ Rcpp::RObject rcpp_result_gen;
+ Rcpp::RNGScope rcpp_rngScope_gen;
+ Rcpp::traits::input_parameter< DataFrame >::type data(dataSEXP);
+ Rcpp::traits::input_parameter< SymbolVector >::type symbols(symbolsSEXP);
+ Rcpp::traits::input_parameter< bool >::type drop(dropSEXP);
+ rcpp_result_gen = Rcpp::wrap(grouped_df_impl(data, symbols, drop));
+ return rcpp_result_gen;
+END_RCPP
+}
+// as_regular_df
+DataFrame as_regular_df(DataFrame df);
+RcppExport SEXP _dplyr_as_regular_df(SEXP dfSEXP) {
+BEGIN_RCPP
+ Rcpp::RObject rcpp_result_gen;
+ Rcpp::RNGScope rcpp_rngScope_gen;
+ Rcpp::traits::input_parameter< DataFrame >::type df(dfSEXP);
+ rcpp_result_gen = Rcpp::wrap(as_regular_df(df));
+ return rcpp_result_gen;
+END_RCPP
+}
+// ungroup_grouped_df
+DataFrame ungroup_grouped_df(DataFrame df);
+RcppExport SEXP _dplyr_ungroup_grouped_df(SEXP dfSEXP) {
+BEGIN_RCPP
+ Rcpp::RObject rcpp_result_gen;
+ Rcpp::RNGScope rcpp_rngScope_gen;
+ Rcpp::traits::input_parameter< DataFrame >::type df(dfSEXP);
+ rcpp_result_gen = Rcpp::wrap(ungroup_grouped_df(df));
+ return rcpp_result_gen;
+END_RCPP
+}
+// test_grouped_df
+SEXP test_grouped_df(DataFrame data);
+RcppExport SEXP _dplyr_test_grouped_df(SEXP dataSEXP) {
+BEGIN_RCPP
+ Rcpp::RObject rcpp_result_gen;
+ Rcpp::RNGScope rcpp_rngScope_gen;
+ Rcpp::traits::input_parameter< DataFrame >::type data(dataSEXP);
+ rcpp_result_gen = Rcpp::wrap(test_grouped_df(data));
+ return rcpp_result_gen;
+END_RCPP
+}
+// grouped_indices_grouped_df_impl
+IntegerVector grouped_indices_grouped_df_impl(GroupedDataFrame gdf);
+RcppExport SEXP _dplyr_grouped_indices_grouped_df_impl(SEXP gdfSEXP) {
+BEGIN_RCPP
+ Rcpp::RObject rcpp_result_gen;
+ Rcpp::RNGScope rcpp_rngScope_gen;
+ Rcpp::traits::input_parameter< GroupedDataFrame >::type gdf(gdfSEXP);
+ rcpp_result_gen = Rcpp::wrap(grouped_indices_grouped_df_impl(gdf));
+ return rcpp_result_gen;
+END_RCPP
+}
+// group_size_grouped_cpp
+IntegerVector group_size_grouped_cpp(GroupedDataFrame gdf);
+RcppExport SEXP _dplyr_group_size_grouped_cpp(SEXP gdfSEXP) {
+BEGIN_RCPP
+ Rcpp::RObject rcpp_result_gen;
+ Rcpp::RNGScope rcpp_rngScope_gen;
+ Rcpp::traits::input_parameter< GroupedDataFrame >::type gdf(gdfSEXP);
+ rcpp_result_gen = Rcpp::wrap(group_size_grouped_cpp(gdf));
+ return rcpp_result_gen;
+END_RCPP
+}
+// get_date_classes
+SEXP get_date_classes();
+static SEXP _dplyr_get_date_classes_try() {
+BEGIN_RCPP
+ Rcpp::RObject rcpp_result_gen;
+ rcpp_result_gen = Rcpp::wrap(get_date_classes());
+ return rcpp_result_gen;
+END_RCPP_RETURN_ERROR
+}
+RcppExport SEXP _dplyr_get_date_classes() {
+ SEXP rcpp_result_gen;
+ {
+ Rcpp::RNGScope rcpp_rngScope_gen;
+ rcpp_result_gen = PROTECT(_dplyr_get_date_classes_try());
+ }
+ Rboolean rcpp_isInterrupt_gen = Rf_inherits(rcpp_result_gen, "interrupted-error");
+ if (rcpp_isInterrupt_gen) {
+ UNPROTECT(1);
+ Rf_onintr();
+ }
+ Rboolean rcpp_isError_gen = Rf_inherits(rcpp_result_gen, "try-error");
+ if (rcpp_isError_gen) {
+ SEXP rcpp_msgSEXP_gen = Rf_asChar(rcpp_result_gen);
+ UNPROTECT(1);
+ Rf_error(CHAR(rcpp_msgSEXP_gen));
+ }
+ UNPROTECT(1);
+ return rcpp_result_gen;
+}
+// get_time_classes
+SEXP get_time_classes();
+static SEXP _dplyr_get_time_classes_try() {
+BEGIN_RCPP
+ Rcpp::RObject rcpp_result_gen;
+ rcpp_result_gen = Rcpp::wrap(get_time_classes());
+ return rcpp_result_gen;
+END_RCPP_RETURN_ERROR
+}
+RcppExport SEXP _dplyr_get_time_classes() {
+ SEXP rcpp_result_gen;
+ {
+ Rcpp::RNGScope rcpp_rngScope_gen;
+ rcpp_result_gen = PROTECT(_dplyr_get_time_classes_try());
+ }
+ Rboolean rcpp_isInterrupt_gen = Rf_inherits(rcpp_result_gen, "interrupted-error");
+ if (rcpp_isInterrupt_gen) {
+ UNPROTECT(1);
+ Rf_onintr();
+ }
+ Rboolean rcpp_isError_gen = Rf_inherits(rcpp_result_gen, "try-error");
+ if (rcpp_isError_gen) {
+ SEXP rcpp_msgSEXP_gen = Rf_asChar(rcpp_result_gen);
+ UNPROTECT(1);
+ Rf_error(CHAR(rcpp_msgSEXP_gen));
+ }
+ UNPROTECT(1);
+ return rcpp_result_gen;
+}
+// build_index_cpp
+DataFrame build_index_cpp(DataFrame data);
+static SEXP _dplyr_build_index_cpp_try(SEXP dataSEXP) {
+BEGIN_RCPP
+ Rcpp::RObject rcpp_result_gen;
+ Rcpp::traits::input_parameter< DataFrame >::type data(dataSEXP);
+ rcpp_result_gen = Rcpp::wrap(build_index_cpp(data));
+ return rcpp_result_gen;
+END_RCPP_RETURN_ERROR
+}
+RcppExport SEXP _dplyr_build_index_cpp(SEXP dataSEXP) {
+ SEXP rcpp_result_gen;
+ {
+ Rcpp::RNGScope rcpp_rngScope_gen;
+ rcpp_result_gen = PROTECT(_dplyr_build_index_cpp_try(dataSEXP));
+ }
+ Rboolean rcpp_isInterrupt_gen = Rf_inherits(rcpp_result_gen, "interrupted-error");
+ if (rcpp_isInterrupt_gen) {
+ UNPROTECT(1);
+ Rf_onintr();
+ }
+ Rboolean rcpp_isError_gen = Rf_inherits(rcpp_result_gen, "try-error");
+ if (rcpp_isError_gen) {
+ SEXP rcpp_msgSEXP_gen = Rf_asChar(rcpp_result_gen);
+ UNPROTECT(1);
+ Rf_error(CHAR(rcpp_msgSEXP_gen));
+ }
+ UNPROTECT(1);
+ return rcpp_result_gen;
+}
// semi_join_impl
-DataFrame semi_join_impl(DataFrame x, DataFrame y, CharacterVector by_x, CharacterVector by_y);
-RcppExport SEXP dplyr_semi_join_impl(SEXP xSEXP, SEXP ySEXP, SEXP by_xSEXP, SEXP by_ySEXP) {
+DataFrame semi_join_impl(DataFrame x, DataFrame y, CharacterVector by_x, CharacterVector by_y, bool na_match);
+RcppExport SEXP _dplyr_semi_join_impl(SEXP xSEXP, SEXP ySEXP, SEXP by_xSEXP, SEXP by_ySEXP, SEXP na_matchSEXP) {
BEGIN_RCPP
- Rcpp::RObject __result;
- Rcpp::RNGScope __rngScope;
+ Rcpp::RObject rcpp_result_gen;
+ Rcpp::RNGScope rcpp_rngScope_gen;
Rcpp::traits::input_parameter< DataFrame >::type x(xSEXP);
Rcpp::traits::input_parameter< DataFrame >::type y(ySEXP);
Rcpp::traits::input_parameter< CharacterVector >::type by_x(by_xSEXP);
Rcpp::traits::input_parameter< CharacterVector >::type by_y(by_ySEXP);
- __result = Rcpp::wrap(semi_join_impl(x, y, by_x, by_y));
- return __result;
+ Rcpp::traits::input_parameter< bool >::type na_match(na_matchSEXP);
+ rcpp_result_gen = Rcpp::wrap(semi_join_impl(x, y, by_x, by_y, na_match));
+ return rcpp_result_gen;
END_RCPP
}
// anti_join_impl
-DataFrame anti_join_impl(DataFrame x, DataFrame y, CharacterVector by_x, CharacterVector by_y);
-RcppExport SEXP dplyr_anti_join_impl(SEXP xSEXP, SEXP ySEXP, SEXP by_xSEXP, SEXP by_ySEXP) {
+DataFrame anti_join_impl(DataFrame x, DataFrame y, CharacterVector by_x, CharacterVector by_y, bool na_match);
+RcppExport SEXP _dplyr_anti_join_impl(SEXP xSEXP, SEXP ySEXP, SEXP by_xSEXP, SEXP by_ySEXP, SEXP na_matchSEXP) {
BEGIN_RCPP
- Rcpp::RObject __result;
- Rcpp::RNGScope __rngScope;
+ Rcpp::RObject rcpp_result_gen;
+ Rcpp::RNGScope rcpp_rngScope_gen;
Rcpp::traits::input_parameter< DataFrame >::type x(xSEXP);
Rcpp::traits::input_parameter< DataFrame >::type y(ySEXP);
Rcpp::traits::input_parameter< CharacterVector >::type by_x(by_xSEXP);
Rcpp::traits::input_parameter< CharacterVector >::type by_y(by_ySEXP);
- __result = Rcpp::wrap(anti_join_impl(x, y, by_x, by_y));
- return __result;
+ Rcpp::traits::input_parameter< bool >::type na_match(na_matchSEXP);
+ rcpp_result_gen = Rcpp::wrap(anti_join_impl(x, y, by_x, by_y, na_match));
+ return rcpp_result_gen;
END_RCPP
}
// inner_join_impl
-DataFrame inner_join_impl(DataFrame x, DataFrame y, CharacterVector by_x, CharacterVector by_y, std::string& suffix_x, std::string& suffix_y);
-RcppExport SEXP dplyr_inner_join_impl(SEXP xSEXP, SEXP ySEXP, SEXP by_xSEXP, SEXP by_ySEXP, SEXP suffix_xSEXP, SEXP suffix_ySEXP) {
+DataFrame inner_join_impl(DataFrame x, DataFrame y, CharacterVector by_x, CharacterVector by_y, std::string& suffix_x, std::string& suffix_y, bool na_match);
+RcppExport SEXP _dplyr_inner_join_impl(SEXP xSEXP, SEXP ySEXP, SEXP by_xSEXP, SEXP by_ySEXP, SEXP suffix_xSEXP, SEXP suffix_ySEXP, SEXP na_matchSEXP) {
BEGIN_RCPP
- Rcpp::RObject __result;
- Rcpp::RNGScope __rngScope;
+ Rcpp::RObject rcpp_result_gen;
+ Rcpp::RNGScope rcpp_rngScope_gen;
Rcpp::traits::input_parameter< DataFrame >::type x(xSEXP);
Rcpp::traits::input_parameter< DataFrame >::type y(ySEXP);
Rcpp::traits::input_parameter< CharacterVector >::type by_x(by_xSEXP);
Rcpp::traits::input_parameter< CharacterVector >::type by_y(by_ySEXP);
Rcpp::traits::input_parameter< std::string& >::type suffix_x(suffix_xSEXP);
Rcpp::traits::input_parameter< std::string& >::type suffix_y(suffix_ySEXP);
- __result = Rcpp::wrap(inner_join_impl(x, y, by_x, by_y, suffix_x, suffix_y));
- return __result;
+ Rcpp::traits::input_parameter< bool >::type na_match(na_matchSEXP);
+ rcpp_result_gen = Rcpp::wrap(inner_join_impl(x, y, by_x, by_y, suffix_x, suffix_y, na_match));
+ return rcpp_result_gen;
END_RCPP
}
// left_join_impl
-DataFrame left_join_impl(DataFrame x, DataFrame y, CharacterVector by_x, CharacterVector by_y, std::string& suffix_x, std::string& suffix_y);
-RcppExport SEXP dplyr_left_join_impl(SEXP xSEXP, SEXP ySEXP, SEXP by_xSEXP, SEXP by_ySEXP, SEXP suffix_xSEXP, SEXP suffix_ySEXP) {
+DataFrame left_join_impl(DataFrame x, DataFrame y, CharacterVector by_x, CharacterVector by_y, std::string& suffix_x, std::string& suffix_y, bool na_match);
+RcppExport SEXP _dplyr_left_join_impl(SEXP xSEXP, SEXP ySEXP, SEXP by_xSEXP, SEXP by_ySEXP, SEXP suffix_xSEXP, SEXP suffix_ySEXP, SEXP na_matchSEXP) {
BEGIN_RCPP
- Rcpp::RObject __result;
- Rcpp::RNGScope __rngScope;
+ Rcpp::RObject rcpp_result_gen;
+ Rcpp::RNGScope rcpp_rngScope_gen;
Rcpp::traits::input_parameter< DataFrame >::type x(xSEXP);
Rcpp::traits::input_parameter< DataFrame >::type y(ySEXP);
Rcpp::traits::input_parameter< CharacterVector >::type by_x(by_xSEXP);
Rcpp::traits::input_parameter< CharacterVector >::type by_y(by_ySEXP);
Rcpp::traits::input_parameter< std::string& >::type suffix_x(suffix_xSEXP);
Rcpp::traits::input_parameter< std::string& >::type suffix_y(suffix_ySEXP);
- __result = Rcpp::wrap(left_join_impl(x, y, by_x, by_y, suffix_x, suffix_y));
- return __result;
+ Rcpp::traits::input_parameter< bool >::type na_match(na_matchSEXP);
+ rcpp_result_gen = Rcpp::wrap(left_join_impl(x, y, by_x, by_y, suffix_x, suffix_y, na_match));
+ return rcpp_result_gen;
END_RCPP
}
// right_join_impl
-DataFrame right_join_impl(DataFrame x, DataFrame y, CharacterVector by_x, CharacterVector by_y, std::string& suffix_x, std::string& suffix_y);
-RcppExport SEXP dplyr_right_join_impl(SEXP xSEXP, SEXP ySEXP, SEXP by_xSEXP, SEXP by_ySEXP, SEXP suffix_xSEXP, SEXP suffix_ySEXP) {
+DataFrame right_join_impl(DataFrame x, DataFrame y, CharacterVector by_x, CharacterVector by_y, std::string& suffix_x, std::string& suffix_y, bool na_match);
+RcppExport SEXP _dplyr_right_join_impl(SEXP xSEXP, SEXP ySEXP, SEXP by_xSEXP, SEXP by_ySEXP, SEXP suffix_xSEXP, SEXP suffix_ySEXP, SEXP na_matchSEXP) {
BEGIN_RCPP
- Rcpp::RObject __result;
- Rcpp::RNGScope __rngScope;
+ Rcpp::RObject rcpp_result_gen;
+ Rcpp::RNGScope rcpp_rngScope_gen;
Rcpp::traits::input_parameter< DataFrame >::type x(xSEXP);
Rcpp::traits::input_parameter< DataFrame >::type y(ySEXP);
Rcpp::traits::input_parameter< CharacterVector >::type by_x(by_xSEXP);
Rcpp::traits::input_parameter< CharacterVector >::type by_y(by_ySEXP);
Rcpp::traits::input_parameter< std::string& >::type suffix_x(suffix_xSEXP);
Rcpp::traits::input_parameter< std::string& >::type suffix_y(suffix_ySEXP);
- __result = Rcpp::wrap(right_join_impl(x, y, by_x, by_y, suffix_x, suffix_y));
- return __result;
+ Rcpp::traits::input_parameter< bool >::type na_match(na_matchSEXP);
+ rcpp_result_gen = Rcpp::wrap(right_join_impl(x, y, by_x, by_y, suffix_x, suffix_y, na_match));
+ return rcpp_result_gen;
END_RCPP
}
// full_join_impl
-DataFrame full_join_impl(DataFrame x, DataFrame y, CharacterVector by_x, CharacterVector by_y, std::string& suffix_x, std::string& suffix_y);
-RcppExport SEXP dplyr_full_join_impl(SEXP xSEXP, SEXP ySEXP, SEXP by_xSEXP, SEXP by_ySEXP, SEXP suffix_xSEXP, SEXP suffix_ySEXP) {
+DataFrame full_join_impl(DataFrame x, DataFrame y, CharacterVector by_x, CharacterVector by_y, std::string& suffix_x, std::string& suffix_y, bool na_match);
+RcppExport SEXP _dplyr_full_join_impl(SEXP xSEXP, SEXP ySEXP, SEXP by_xSEXP, SEXP by_ySEXP, SEXP suffix_xSEXP, SEXP suffix_ySEXP, SEXP na_matchSEXP) {
BEGIN_RCPP
- Rcpp::RObject __result;
- Rcpp::RNGScope __rngScope;
+ Rcpp::RObject rcpp_result_gen;
+ Rcpp::RNGScope rcpp_rngScope_gen;
Rcpp::traits::input_parameter< DataFrame >::type x(xSEXP);
Rcpp::traits::input_parameter< DataFrame >::type y(ySEXP);
Rcpp::traits::input_parameter< CharacterVector >::type by_x(by_xSEXP);
Rcpp::traits::input_parameter< CharacterVector >::type by_y(by_ySEXP);
Rcpp::traits::input_parameter< std::string& >::type suffix_x(suffix_xSEXP);
Rcpp::traits::input_parameter< std::string& >::type suffix_y(suffix_ySEXP);
- __result = Rcpp::wrap(full_join_impl(x, y, by_x, by_y, suffix_x, suffix_y));
- return __result;
+ Rcpp::traits::input_parameter< bool >::type na_match(na_matchSEXP);
+ rcpp_result_gen = Rcpp::wrap(full_join_impl(x, y, by_x, by_y, suffix_x, suffix_y, na_match));
+ return rcpp_result_gen;
END_RCPP
}
-// shallow_copy
-SEXP shallow_copy(const List& data);
-RcppExport SEXP dplyr_shallow_copy(SEXP dataSEXP) {
+// mutate_impl
+SEXP mutate_impl(DataFrame df, QuosureList dots);
+RcppExport SEXP _dplyr_mutate_impl(SEXP dfSEXP, SEXP dotsSEXP) {
BEGIN_RCPP
- Rcpp::RObject __result;
- Rcpp::RNGScope __rngScope;
- Rcpp::traits::input_parameter< const List& >::type data(dataSEXP);
- __result = Rcpp::wrap(shallow_copy(data));
- return __result;
+ Rcpp::RObject rcpp_result_gen;
+ Rcpp::RNGScope rcpp_rngScope_gen;
+ Rcpp::traits::input_parameter< DataFrame >::type df(dfSEXP);
+ Rcpp::traits::input_parameter< QuosureList >::type dots(dotsSEXP);
+ rcpp_result_gen = Rcpp::wrap(mutate_impl(df, dots));
+ return rcpp_result_gen;
+END_RCPP
+}
+// select_impl
+DataFrame select_impl(DataFrame df, CharacterVector vars);
+RcppExport SEXP _dplyr_select_impl(SEXP dfSEXP, SEXP varsSEXP) {
+BEGIN_RCPP
+ Rcpp::RObject rcpp_result_gen;
+ Rcpp::RNGScope rcpp_rngScope_gen;
+ Rcpp::traits::input_parameter< DataFrame >::type df(dfSEXP);
+ Rcpp::traits::input_parameter< CharacterVector >::type vars(varsSEXP);
+ rcpp_result_gen = Rcpp::wrap(select_impl(df, vars));
+ return rcpp_result_gen;
END_RCPP
}
// compatible_data_frame_nonames
dplyr::BoolResult compatible_data_frame_nonames(DataFrame x, DataFrame y, bool convert);
-RcppExport SEXP dplyr_compatible_data_frame_nonames(SEXP xSEXP, SEXP ySEXP, SEXP convertSEXP) {
+RcppExport SEXP _dplyr_compatible_data_frame_nonames(SEXP xSEXP, SEXP ySEXP, SEXP convertSEXP) {
BEGIN_RCPP
- Rcpp::RObject __result;
- Rcpp::RNGScope __rngScope;
+ Rcpp::RObject rcpp_result_gen;
+ Rcpp::RNGScope rcpp_rngScope_gen;
Rcpp::traits::input_parameter< DataFrame >::type x(xSEXP);
Rcpp::traits::input_parameter< DataFrame >::type y(ySEXP);
Rcpp::traits::input_parameter< bool >::type convert(convertSEXP);
- __result = Rcpp::wrap(compatible_data_frame_nonames(x, y, convert));
- return __result;
+ rcpp_result_gen = Rcpp::wrap(compatible_data_frame_nonames(x, y, convert));
+ return rcpp_result_gen;
END_RCPP
}
// compatible_data_frame
dplyr::BoolResult compatible_data_frame(DataFrame x, DataFrame y, bool ignore_col_order, bool convert);
-RcppExport SEXP dplyr_compatible_data_frame(SEXP xSEXP, SEXP ySEXP, SEXP ignore_col_orderSEXP, SEXP convertSEXP) {
+RcppExport SEXP _dplyr_compatible_data_frame(SEXP xSEXP, SEXP ySEXP, SEXP ignore_col_orderSEXP, SEXP convertSEXP) {
BEGIN_RCPP
- Rcpp::RObject __result;
- Rcpp::RNGScope __rngScope;
+ Rcpp::RObject rcpp_result_gen;
+ Rcpp::RNGScope rcpp_rngScope_gen;
Rcpp::traits::input_parameter< DataFrame >::type x(xSEXP);
Rcpp::traits::input_parameter< DataFrame >::type y(ySEXP);
Rcpp::traits::input_parameter< bool >::type ignore_col_order(ignore_col_orderSEXP);
Rcpp::traits::input_parameter< bool >::type convert(convertSEXP);
- __result = Rcpp::wrap(compatible_data_frame(x, y, ignore_col_order, convert));
- return __result;
+ rcpp_result_gen = Rcpp::wrap(compatible_data_frame(x, y, ignore_col_order, convert));
+ return rcpp_result_gen;
END_RCPP
}
// equal_data_frame
dplyr::BoolResult equal_data_frame(DataFrame x, DataFrame y, bool ignore_col_order, bool ignore_row_order, bool convert);
-RcppExport SEXP dplyr_equal_data_frame(SEXP xSEXP, SEXP ySEXP, SEXP ignore_col_orderSEXP, SEXP ignore_row_orderSEXP, SEXP convertSEXP) {
+RcppExport SEXP _dplyr_equal_data_frame(SEXP xSEXP, SEXP ySEXP, SEXP ignore_col_orderSEXP, SEXP ignore_row_orderSEXP, SEXP convertSEXP) {
BEGIN_RCPP
- Rcpp::RObject __result;
- Rcpp::RNGScope __rngScope;
+ Rcpp::RObject rcpp_result_gen;
+ Rcpp::RNGScope rcpp_rngScope_gen;
Rcpp::traits::input_parameter< DataFrame >::type x(xSEXP);
Rcpp::traits::input_parameter< DataFrame >::type y(ySEXP);
Rcpp::traits::input_parameter< bool >::type ignore_col_order(ignore_col_orderSEXP);
Rcpp::traits::input_parameter< bool >::type ignore_row_order(ignore_row_orderSEXP);
Rcpp::traits::input_parameter< bool >::type convert(convertSEXP);
- __result = Rcpp::wrap(equal_data_frame(x, y, ignore_col_order, ignore_row_order, convert));
- return __result;
-END_RCPP
-}
-// all_equal_data_frame
-dplyr::BoolResult all_equal_data_frame(List args, Environment env);
-RcppExport SEXP dplyr_all_equal_data_frame(SEXP argsSEXP, SEXP envSEXP) {
-BEGIN_RCPP
- Rcpp::RObject __result;
- Rcpp::RNGScope __rngScope;
- Rcpp::traits::input_parameter< List >::type args(argsSEXP);
- Rcpp::traits::input_parameter< Environment >::type env(envSEXP);
- __result = Rcpp::wrap(all_equal_data_frame(args, env));
- return __result;
+ rcpp_result_gen = Rcpp::wrap(equal_data_frame(x, y, ignore_col_order, ignore_row_order, convert));
+ return rcpp_result_gen;
END_RCPP
}
// union_data_frame
DataFrame union_data_frame(DataFrame x, DataFrame y);
-RcppExport SEXP dplyr_union_data_frame(SEXP xSEXP, SEXP ySEXP) {
+RcppExport SEXP _dplyr_union_data_frame(SEXP xSEXP, SEXP ySEXP) {
BEGIN_RCPP
- Rcpp::RObject __result;
- Rcpp::RNGScope __rngScope;
+ Rcpp::RObject rcpp_result_gen;
+ Rcpp::RNGScope rcpp_rngScope_gen;
Rcpp::traits::input_parameter< DataFrame >::type x(xSEXP);
Rcpp::traits::input_parameter< DataFrame >::type y(ySEXP);
- __result = Rcpp::wrap(union_data_frame(x, y));
- return __result;
+ rcpp_result_gen = Rcpp::wrap(union_data_frame(x, y));
+ return rcpp_result_gen;
END_RCPP
}
// intersect_data_frame
DataFrame intersect_data_frame(DataFrame x, DataFrame y);
-RcppExport SEXP dplyr_intersect_data_frame(SEXP xSEXP, SEXP ySEXP) {
+RcppExport SEXP _dplyr_intersect_data_frame(SEXP xSEXP, SEXP ySEXP) {
BEGIN_RCPP
- Rcpp::RObject __result;
- Rcpp::RNGScope __rngScope;
+ Rcpp::RObject rcpp_result_gen;
+ Rcpp::RNGScope rcpp_rngScope_gen;
Rcpp::traits::input_parameter< DataFrame >::type x(xSEXP);
Rcpp::traits::input_parameter< DataFrame >::type y(ySEXP);
- __result = Rcpp::wrap(intersect_data_frame(x, y));
- return __result;
+ rcpp_result_gen = Rcpp::wrap(intersect_data_frame(x, y));
+ return rcpp_result_gen;
END_RCPP
}
// setdiff_data_frame
DataFrame setdiff_data_frame(DataFrame x, DataFrame y);
-RcppExport SEXP dplyr_setdiff_data_frame(SEXP xSEXP, SEXP ySEXP) {
-BEGIN_RCPP
- Rcpp::RObject __result;
- Rcpp::RNGScope __rngScope;
- Rcpp::traits::input_parameter< DataFrame >::type x(xSEXP);
- Rcpp::traits::input_parameter< DataFrame >::type y(ySEXP);
- __result = Rcpp::wrap(setdiff_data_frame(x, y));
- return __result;
-END_RCPP
-}
-// match_data_frame
-IntegerVector match_data_frame(DataFrame x, DataFrame y);
-RcppExport SEXP dplyr_match_data_frame(SEXP xSEXP, SEXP ySEXP) {
+RcppExport SEXP _dplyr_setdiff_data_frame(SEXP xSEXP, SEXP ySEXP) {
BEGIN_RCPP
- Rcpp::RObject __result;
- Rcpp::RNGScope __rngScope;
+ Rcpp::RObject rcpp_result_gen;
+ Rcpp::RNGScope rcpp_rngScope_gen;
Rcpp::traits::input_parameter< DataFrame >::type x(xSEXP);
Rcpp::traits::input_parameter< DataFrame >::type y(ySEXP);
- __result = Rcpp::wrap(match_data_frame(x, y));
- return __result;
-END_RCPP
-}
-// resolve_vars
-SEXP resolve_vars(List new_groups, CharacterVector names);
-RcppExport SEXP dplyr_resolve_vars(SEXP new_groupsSEXP, SEXP namesSEXP) {
-BEGIN_RCPP
- Rcpp::RObject __result;
- Rcpp::RNGScope __rngScope;
- Rcpp::traits::input_parameter< List >::type new_groups(new_groupsSEXP);
- Rcpp::traits::input_parameter< CharacterVector >::type names(namesSEXP);
- __result = Rcpp::wrap(resolve_vars(new_groups, names));
- return __result;
-END_RCPP
-}
-// grouped_df_impl
-DataFrame grouped_df_impl(DataFrame data, ListOf<Symbol> symbols, bool drop);
-RcppExport SEXP dplyr_grouped_df_impl(SEXP dataSEXP, SEXP symbolsSEXP, SEXP dropSEXP) {
-BEGIN_RCPP
- Rcpp::RObject __result;
- Rcpp::RNGScope __rngScope;
- Rcpp::traits::input_parameter< DataFrame >::type data(dataSEXP);
- Rcpp::traits::input_parameter< ListOf<Symbol> >::type symbols(symbolsSEXP);
- Rcpp::traits::input_parameter< bool >::type drop(dropSEXP);
- __result = Rcpp::wrap(grouped_df_impl(data, symbols, drop));
- return __result;
-END_RCPP
-}
-// grouped_df_adj_impl
-DataFrame grouped_df_adj_impl(DataFrame data, ListOf<Symbol> symbols, bool drop);
-RcppExport SEXP dplyr_grouped_df_adj_impl(SEXP dataSEXP, SEXP symbolsSEXP, SEXP dropSEXP) {
-BEGIN_RCPP
- Rcpp::RObject __result;
- Rcpp::RNGScope __rngScope;
- Rcpp::traits::input_parameter< DataFrame >::type data(dataSEXP);
- Rcpp::traits::input_parameter< ListOf<Symbol> >::type symbols(symbolsSEXP);
- Rcpp::traits::input_parameter< bool >::type drop(dropSEXP);
- __result = Rcpp::wrap(grouped_df_adj_impl(data, symbols, drop));
- return __result;
+ rcpp_result_gen = Rcpp::wrap(setdiff_data_frame(x, y));
+ return rcpp_result_gen;
END_RCPP
}
// slice_impl
-SEXP slice_impl(DataFrame df, LazyDots dots);
-RcppExport SEXP dplyr_slice_impl(SEXP dfSEXP, SEXP dotsSEXP) {
-BEGIN_RCPP
- Rcpp::RObject __result;
- Rcpp::RNGScope __rngScope;
- Rcpp::traits::input_parameter< DataFrame >::type df(dfSEXP);
- Rcpp::traits::input_parameter< LazyDots >::type dots(dotsSEXP);
- __result = Rcpp::wrap(slice_impl(df, dots));
- return __result;
-END_RCPP
-}
-// mutate_impl
-SEXP mutate_impl(DataFrame df, LazyDots dots);
-RcppExport SEXP dplyr_mutate_impl(SEXP dfSEXP, SEXP dotsSEXP) {
+SEXP slice_impl(DataFrame df, QuosureList dots);
+RcppExport SEXP _dplyr_slice_impl(SEXP dfSEXP, SEXP dotsSEXP) {
BEGIN_RCPP
- Rcpp::RObject __result;
- Rcpp::RNGScope __rngScope;
+ Rcpp::RObject rcpp_result_gen;
+ Rcpp::RNGScope rcpp_rngScope_gen;
Rcpp::traits::input_parameter< DataFrame >::type df(dfSEXP);
- Rcpp::traits::input_parameter< LazyDots >::type dots(dotsSEXP);
- __result = Rcpp::wrap(mutate_impl(df, dots));
- return __result;
-END_RCPP
-}
-// order_impl
-IntegerVector order_impl(List args, Environment env);
-RcppExport SEXP dplyr_order_impl(SEXP argsSEXP, SEXP envSEXP) {
-BEGIN_RCPP
- Rcpp::RObject __result;
- Rcpp::RNGScope __rngScope;
- Rcpp::traits::input_parameter< List >::type args(argsSEXP);
- Rcpp::traits::input_parameter< Environment >::type env(envSEXP);
- __result = Rcpp::wrap(order_impl(args, env));
- return __result;
-END_RCPP
-}
-// sort_impl
-DataFrame sort_impl(DataFrame data);
-RcppExport SEXP dplyr_sort_impl(SEXP dataSEXP) {
-BEGIN_RCPP
- Rcpp::RObject __result;
- Rcpp::RNGScope __rngScope;
- Rcpp::traits::input_parameter< DataFrame >::type data(dataSEXP);
- __result = Rcpp::wrap(sort_impl(data));
- return __result;
+ Rcpp::traits::input_parameter< QuosureList >::type dots(dotsSEXP);
+ rcpp_result_gen = Rcpp::wrap(slice_impl(df, dots));
+ return rcpp_result_gen;
END_RCPP
}
-// group_size_grouped_cpp
-IntegerVector group_size_grouped_cpp(GroupedDataFrame gdf);
-RcppExport SEXP dplyr_group_size_grouped_cpp(SEXP gdfSEXP) {
-BEGIN_RCPP
- Rcpp::RObject __result;
- Rcpp::RNGScope __rngScope;
- Rcpp::traits::input_parameter< GroupedDataFrame >::type gdf(gdfSEXP);
- __result = Rcpp::wrap(group_size_grouped_cpp(gdf));
- return __result;
-END_RCPP
-}
-// n_distinct_multi
-SEXP n_distinct_multi(List variables, bool na_rm);
-RcppExport SEXP dplyr_n_distinct_multi(SEXP variablesSEXP, SEXP na_rmSEXP) {
-BEGIN_RCPP
- Rcpp::RObject __result;
- Rcpp::RNGScope __rngScope;
- Rcpp::traits::input_parameter< List >::type variables(variablesSEXP);
- Rcpp::traits::input_parameter< bool >::type na_rm(na_rmSEXP);
- __result = Rcpp::wrap(n_distinct_multi(variables, na_rm));
- return __result;
-END_RCPP
-}
-// as_regular_df
-DataFrame as_regular_df(DataFrame df);
-RcppExport SEXP dplyr_as_regular_df(SEXP dfSEXP) {
-BEGIN_RCPP
- Rcpp::RObject __result;
- Rcpp::RNGScope __rngScope;
- Rcpp::traits::input_parameter< DataFrame >::type df(dfSEXP);
- __result = Rcpp::wrap(as_regular_df(df));
- return __result;
-END_RCPP
-}
-// ungroup_grouped_df
-DataFrame ungroup_grouped_df(DataFrame df);
-RcppExport SEXP dplyr_ungroup_grouped_df(SEXP dfSEXP) {
-BEGIN_RCPP
- Rcpp::RObject __result;
- Rcpp::RNGScope __rngScope;
- Rcpp::traits::input_parameter< DataFrame >::type df(dfSEXP);
- __result = Rcpp::wrap(ungroup_grouped_df(df));
- return __result;
-END_RCPP
-}
-// split_indices
-std::vector<std::vector<int> > split_indices(IntegerVector group, int groups);
-RcppExport SEXP dplyr_split_indices(SEXP groupSEXP, SEXP groupsSEXP) {
-BEGIN_RCPP
- Rcpp::RObject __result;
- Rcpp::RNGScope __rngScope;
- Rcpp::traits::input_parameter< IntegerVector >::type group(groupSEXP);
- Rcpp::traits::input_parameter< int >::type groups(groupsSEXP);
- __result = Rcpp::wrap(split_indices(group, groups));
- return __result;
-END_RCPP
-}
-// gp
-unsigned short gp(SEXP x);
-RcppExport SEXP dplyr_gp(SEXP xSEXP) {
-BEGIN_RCPP
- Rcpp::RObject __result;
- Rcpp::RNGScope __rngScope;
- Rcpp::traits::input_parameter< SEXP >::type x(xSEXP);
- __result = Rcpp::wrap(gp(x));
- return __result;
-END_RCPP
-}
-// filter_impl
-SEXP filter_impl(DataFrame df, LazyDots dots);
-RcppExport SEXP dplyr_filter_impl(SEXP dfSEXP, SEXP dotsSEXP) {
+// summarise_impl
+SEXP summarise_impl(DataFrame df, QuosureList dots);
+RcppExport SEXP _dplyr_summarise_impl(SEXP dfSEXP, SEXP dotsSEXP) {
BEGIN_RCPP
- Rcpp::RObject __result;
- Rcpp::RNGScope __rngScope;
+ Rcpp::RObject rcpp_result_gen;
+ Rcpp::RNGScope rcpp_rngScope_gen;
Rcpp::traits::input_parameter< DataFrame >::type df(dfSEXP);
- Rcpp::traits::input_parameter< LazyDots >::type dots(dotsSEXP);
- __result = Rcpp::wrap(filter_impl(df, dots));
- return __result;
+ Rcpp::traits::input_parameter< QuosureList >::type dots(dotsSEXP);
+ rcpp_result_gen = Rcpp::wrap(summarise_impl(df, dots));
+ return rcpp_result_gen;
END_RCPP
}
-// grouped_indices_grouped_df_impl
-IntegerVector grouped_indices_grouped_df_impl(GroupedDataFrame gdf);
-RcppExport SEXP dplyr_grouped_indices_grouped_df_impl(SEXP gdfSEXP) {
+// test_comparisons
+LogicalVector test_comparisons();
+RcppExport SEXP _dplyr_test_comparisons() {
BEGIN_RCPP
- Rcpp::RObject __result;
- Rcpp::RNGScope __rngScope;
- Rcpp::traits::input_parameter< GroupedDataFrame >::type gdf(gdfSEXP);
- __result = Rcpp::wrap(grouped_indices_grouped_df_impl(gdf));
- return __result;
+ Rcpp::RObject rcpp_result_gen;
+ Rcpp::RNGScope rcpp_rngScope_gen;
+ rcpp_result_gen = Rcpp::wrap(test_comparisons());
+ return rcpp_result_gen;
END_RCPP
}
-// grouped_indices_impl
-IntegerVector grouped_indices_impl(DataFrame data, ListOf<Symbol> symbols);
-RcppExport SEXP dplyr_grouped_indices_impl(SEXP dataSEXP, SEXP symbolsSEXP) {
+// test_matches
+List test_matches();
+RcppExport SEXP _dplyr_test_matches() {
BEGIN_RCPP
- Rcpp::RObject __result;
- Rcpp::RNGScope __rngScope;
- Rcpp::traits::input_parameter< DataFrame >::type data(dataSEXP);
- Rcpp::traits::input_parameter< ListOf<Symbol> >::type symbols(symbolsSEXP);
- __result = Rcpp::wrap(grouped_indices_impl(data, symbols));
- return __result;
-END_RCPP
-}
-// select_impl
-DataFrame select_impl(DataFrame df, CharacterVector vars);
-RcppExport SEXP dplyr_select_impl(SEXP dfSEXP, SEXP varsSEXP) {
-BEGIN_RCPP
- Rcpp::RObject __result;
- Rcpp::RNGScope __rngScope;
- Rcpp::traits::input_parameter< DataFrame >::type df(dfSEXP);
- Rcpp::traits::input_parameter< CharacterVector >::type vars(varsSEXP);
- __result = Rcpp::wrap(select_impl(df, vars));
- return __result;
+ Rcpp::RObject rcpp_result_gen;
+ Rcpp::RNGScope rcpp_rngScope_gen;
+ rcpp_result_gen = Rcpp::wrap(test_matches());
+ return rcpp_result_gen;
END_RCPP
}
-// strings_addresses
-CharacterVector strings_addresses(CharacterVector s);
-RcppExport SEXP dplyr_strings_addresses(SEXP sSEXP) {
+// test_length_wrap
+LogicalVector test_length_wrap();
+RcppExport SEXP _dplyr_test_length_wrap() {
BEGIN_RCPP
- Rcpp::RObject __result;
- Rcpp::RNGScope __rngScope;
- Rcpp::traits::input_parameter< CharacterVector >::type s(sSEXP);
- __result = Rcpp::wrap(strings_addresses(s));
- return __result;
+ Rcpp::RObject rcpp_result_gen;
+ Rcpp::RNGScope rcpp_rngScope_gen;
+ rcpp_result_gen = Rcpp::wrap(test_length_wrap());
+ return rcpp_result_gen;
END_RCPP
}
-// summarise_impl
-SEXP summarise_impl(DataFrame df, LazyDots dots);
-RcppExport SEXP dplyr_summarise_impl(SEXP dfSEXP, SEXP dotsSEXP) {
+// assert_all_white_list
+void assert_all_white_list(const DataFrame& data);
+RcppExport SEXP _dplyr_assert_all_white_list(SEXP dataSEXP) {
BEGIN_RCPP
- Rcpp::RObject __result;
- Rcpp::RNGScope __rngScope;
- Rcpp::traits::input_parameter< DataFrame >::type df(dfSEXP);
- Rcpp::traits::input_parameter< LazyDots >::type dots(dotsSEXP);
- __result = Rcpp::wrap(summarise_impl(df, dots));
- return __result;
+ Rcpp::RNGScope rcpp_rngScope_gen;
+ Rcpp::traits::input_parameter< const DataFrame& >::type data(dataSEXP);
+ assert_all_white_list(data);
+ return R_NilValue;
END_RCPP
}
-// test_comparisons
-LogicalVector test_comparisons();
-RcppExport SEXP dplyr_test_comparisons() {
+// shallow_copy
+SEXP shallow_copy(const List& data);
+RcppExport SEXP _dplyr_shallow_copy(SEXP dataSEXP) {
BEGIN_RCPP
- Rcpp::RObject __result;
- Rcpp::RNGScope __rngScope;
- __result = Rcpp::wrap(test_comparisons());
- return __result;
+ Rcpp::RObject rcpp_result_gen;
+ Rcpp::RNGScope rcpp_rngScope_gen;
+ Rcpp::traits::input_parameter< const List& >::type data(dataSEXP);
+ rcpp_result_gen = Rcpp::wrap(shallow_copy(data));
+ return rcpp_result_gen;
END_RCPP
}
// cumall
LogicalVector cumall(LogicalVector x);
-RcppExport SEXP dplyr_cumall(SEXP xSEXP) {
+RcppExport SEXP _dplyr_cumall(SEXP xSEXP) {
BEGIN_RCPP
- Rcpp::RObject __result;
- Rcpp::RNGScope __rngScope;
+ Rcpp::RObject rcpp_result_gen;
+ Rcpp::RNGScope rcpp_rngScope_gen;
Rcpp::traits::input_parameter< LogicalVector >::type x(xSEXP);
- __result = Rcpp::wrap(cumall(x));
- return __result;
+ rcpp_result_gen = Rcpp::wrap(cumall(x));
+ return rcpp_result_gen;
END_RCPP
}
// cumany
LogicalVector cumany(LogicalVector x);
-RcppExport SEXP dplyr_cumany(SEXP xSEXP) {
+RcppExport SEXP _dplyr_cumany(SEXP xSEXP) {
BEGIN_RCPP
- Rcpp::RObject __result;
- Rcpp::RNGScope __rngScope;
+ Rcpp::RObject rcpp_result_gen;
+ Rcpp::RNGScope rcpp_rngScope_gen;
Rcpp::traits::input_parameter< LogicalVector >::type x(xSEXP);
- __result = Rcpp::wrap(cumany(x));
- return __result;
+ rcpp_result_gen = Rcpp::wrap(cumany(x));
+ return rcpp_result_gen;
END_RCPP
}
// cummean
NumericVector cummean(NumericVector x);
-RcppExport SEXP dplyr_cummean(SEXP xSEXP) {
+RcppExport SEXP _dplyr_cummean(SEXP xSEXP) {
BEGIN_RCPP
- Rcpp::RObject __result;
- Rcpp::RNGScope __rngScope;
+ Rcpp::RObject rcpp_result_gen;
+ Rcpp::RNGScope rcpp_rngScope_gen;
Rcpp::traits::input_parameter< NumericVector >::type x(xSEXP);
- __result = Rcpp::wrap(cummean(x));
- return __result;
+ rcpp_result_gen = Rcpp::wrap(cummean(x));
+ return rcpp_result_gen;
END_RCPP
}
+
+// validate (ensure exported C++ functions exist before calling them)
+static int _dplyr_RcppExport_validate(const char* sig) {
+ static std::set<std::string> signatures;
+ if (signatures.empty()) {
+ signatures.insert("SEXP(*get_date_classes)()");
+ signatures.insert("SEXP(*get_time_classes)()");
+ signatures.insert("DataFrame(*build_index_cpp)(DataFrame)");
+ }
+ return signatures.find(sig) != signatures.end();
+}
+
+// registerCCallable (register entry points for exported C++ functions)
+RcppExport SEXP _dplyr_RcppExport_registerCCallable() {
+ R_RegisterCCallable("dplyr", "_dplyr_get_date_classes", (DL_FUNC)_dplyr_get_date_classes_try);
+ R_RegisterCCallable("dplyr", "_dplyr_get_time_classes", (DL_FUNC)_dplyr_get_time_classes_try);
+ R_RegisterCCallable("dplyr", "_dplyr_build_index_cpp", (DL_FUNC)_dplyr_build_index_cpp_try);
+ R_RegisterCCallable("dplyr", "_dplyr_RcppExport_validate", (DL_FUNC)_dplyr_RcppExport_validate);
+ return R_NilValue;
+}
+
+static const R_CallMethodDef CallEntries[] = {
+ {"_dplyr_loc", (DL_FUNC) &_dplyr_loc, 1},
+ {"_dplyr_dfloc", (DL_FUNC) &_dplyr_dfloc, 1},
+ {"_dplyr_plfloc", (DL_FUNC) &_dplyr_plfloc, 1},
+ {"_dplyr_strings_addresses", (DL_FUNC) &_dplyr_strings_addresses, 1},
+ {"_dplyr_gp", (DL_FUNC) &_dplyr_gp, 1},
+ {"_dplyr_init_logging", (DL_FUNC) &_dplyr_init_logging, 1},
+ {"_dplyr_arrange_impl", (DL_FUNC) &_dplyr_arrange_impl, 2},
+ {"_dplyr_between", (DL_FUNC) &_dplyr_between, 3},
+ {"_dplyr_flatten_bindable", (DL_FUNC) &_dplyr_flatten_bindable, 1},
+ {"_dplyr_bind_rows_", (DL_FUNC) &_dplyr_bind_rows_, 2},
+ {"_dplyr_cbind_all", (DL_FUNC) &_dplyr_cbind_all, 1},
+ {"_dplyr_combine_all", (DL_FUNC) &_dplyr_combine_all, 1},
+ {"_dplyr_combine_vars", (DL_FUNC) &_dplyr_combine_vars, 2},
+ {"_dplyr_distinct_impl", (DL_FUNC) &_dplyr_distinct_impl, 3},
+ {"_dplyr_n_distinct_multi", (DL_FUNC) &_dplyr_n_distinct_multi, 2},
+ {"_dplyr_filter_impl", (DL_FUNC) &_dplyr_filter_impl, 2},
+ {"_dplyr_grouped_df_impl", (DL_FUNC) &_dplyr_grouped_df_impl, 3},
+ {"_dplyr_as_regular_df", (DL_FUNC) &_dplyr_as_regular_df, 1},
+ {"_dplyr_ungroup_grouped_df", (DL_FUNC) &_dplyr_ungroup_grouped_df, 1},
+ {"_dplyr_test_grouped_df", (DL_FUNC) &_dplyr_test_grouped_df, 1},
+ {"_dplyr_grouped_indices_grouped_df_impl", (DL_FUNC) &_dplyr_grouped_indices_grouped_df_impl, 1},
+ {"_dplyr_group_size_grouped_cpp", (DL_FUNC) &_dplyr_group_size_grouped_cpp, 1},
+ {"_dplyr_get_date_classes", (DL_FUNC) &_dplyr_get_date_classes, 0},
+ {"_dplyr_get_time_classes", (DL_FUNC) &_dplyr_get_time_classes, 0},
+ {"_dplyr_build_index_cpp", (DL_FUNC) &_dplyr_build_index_cpp, 1},
+ {"_dplyr_semi_join_impl", (DL_FUNC) &_dplyr_semi_join_impl, 5},
+ {"_dplyr_anti_join_impl", (DL_FUNC) &_dplyr_anti_join_impl, 5},
+ {"_dplyr_inner_join_impl", (DL_FUNC) &_dplyr_inner_join_impl, 7},
+ {"_dplyr_left_join_impl", (DL_FUNC) &_dplyr_left_join_impl, 7},
+ {"_dplyr_right_join_impl", (DL_FUNC) &_dplyr_right_join_impl, 7},
+ {"_dplyr_full_join_impl", (DL_FUNC) &_dplyr_full_join_impl, 7},
+ {"_dplyr_mutate_impl", (DL_FUNC) &_dplyr_mutate_impl, 2},
+ {"_dplyr_select_impl", (DL_FUNC) &_dplyr_select_impl, 2},
+ {"_dplyr_compatible_data_frame_nonames", (DL_FUNC) &_dplyr_compatible_data_frame_nonames, 3},
+ {"_dplyr_compatible_data_frame", (DL_FUNC) &_dplyr_compatible_data_frame, 4},
+ {"_dplyr_equal_data_frame", (DL_FUNC) &_dplyr_equal_data_frame, 5},
+ {"_dplyr_union_data_frame", (DL_FUNC) &_dplyr_union_data_frame, 2},
+ {"_dplyr_intersect_data_frame", (DL_FUNC) &_dplyr_intersect_data_frame, 2},
+ {"_dplyr_setdiff_data_frame", (DL_FUNC) &_dplyr_setdiff_data_frame, 2},
+ {"_dplyr_slice_impl", (DL_FUNC) &_dplyr_slice_impl, 2},
+ {"_dplyr_summarise_impl", (DL_FUNC) &_dplyr_summarise_impl, 2},
+ {"_dplyr_test_comparisons", (DL_FUNC) &_dplyr_test_comparisons, 0},
+ {"_dplyr_test_matches", (DL_FUNC) &_dplyr_test_matches, 0},
+ {"_dplyr_test_length_wrap", (DL_FUNC) &_dplyr_test_length_wrap, 0},
+ {"_dplyr_assert_all_white_list", (DL_FUNC) &_dplyr_assert_all_white_list, 1},
+ {"_dplyr_shallow_copy", (DL_FUNC) &_dplyr_shallow_copy, 1},
+ {"_dplyr_cumall", (DL_FUNC) &_dplyr_cumall, 1},
+ {"_dplyr_cumany", (DL_FUNC) &_dplyr_cumany, 1},
+ {"_dplyr_cummean", (DL_FUNC) &_dplyr_cummean, 1},
+ {"_dplyr_RcppExport_registerCCallable", (DL_FUNC) &_dplyr_RcppExport_registerCCallable, 0},
+ {NULL, NULL, 0}
+};
+
+RcppExport void R_init_dplyr(DllInfo *dll) {
+ R_registerRoutines(dll, NULL, CallEntries, NULL, NULL);
+ R_useDynamicSymbols(dll, FALSE);
+}
diff --git a/src/address.cpp b/src/address.cpp
index d783348..bc86c2b 100644
--- a/src/address.cpp
+++ b/src/address.cpp
@@ -1,10 +1,14 @@
-#include <Rcpp.h>
+#include "pch.h"
+#include <dplyr/main.h>
+
+#include <tools/encoding.h>
+
using namespace Rcpp;
-const char* address(SEXP x){
- static char buffer[20] ;
- snprintf( buffer, 20, "%p", reinterpret_cast<void*>(x) ) ;
- return (const char*)buffer ;
+const char* address(SEXP x) {
+ static char buffer[20];
+ snprintf(buffer, 20, "%p", reinterpret_cast<void*>(x));
+ return (const char*)buffer;
}
// [[Rcpp::export]]
@@ -15,28 +19,67 @@ CharacterVector loc(RObject data) {
}
// [[Rcpp::export]]
-CharacterVector dfloc(List df){
- int n = df.size() ;
+CharacterVector dfloc(List df) {
+ int n = df.size();
CharacterVector pointers(n);
- for( int i=0; i<n; i++) {
- pointers[i] = address(df[i]) ;
+ for (int i = 0; i < n; i++) {
+ pointers[i] = address(df[i]);
}
- pointers.names() = df.names() ;
- return pointers ;
+ pointers.names() = df.names();
+ return pointers;
}
// [[Rcpp::export]]
-CharacterVector plfloc(Pairlist data){
- int n = data.size() ;
- CharacterVector pointers(n), names(n) ;
- SEXP p = data ;
- int i=0 ;
- while( ! Rf_isNull(p) ){
- pointers[i] = address(CAR(p)) ;
- names[i] = PRINTNAME(TAG(p)) ;
- p = CDR(p) ;
- i++ ;
+CharacterVector plfloc(Pairlist data) {
+ int n = data.size();
+ CharacterVector pointers(n), names(n);
+ SEXP p = data;
+ int i = 0;
+ while (! Rf_isNull(p)) {
+ pointers[i] = address(CAR(p));
+ names[i] = PRINTNAME(TAG(p));
+ p = CDR(p);
+ i++;
}
- pointers.names() = names ;
+ pointers.names() = names;
return pointers;
}
+
+// [[Rcpp::export]]
+CharacterVector strings_addresses(CharacterVector s) {
+ static char buffer[20];
+ int n = s.size();
+
+ CharacterVector res(n);
+ for (int i = 0; i < n; i++) {
+ SEXP x = s[i];
+ snprintf(buffer, 20, "%p", reinterpret_cast<void*>(x));
+ res[i] = buffer;
+ }
+ res.names() = s;
+
+ return res;
+}
+
+// simple internal debugging function to access the gp part of the SEXP
+// only meant for internal use in dplyr debugging
+
+// [[Rcpp::export]]
+unsigned short gp(SEXP x) {
+ return reinterpret_cast<sxpinfo_struct*>(x)->gp;
+}
+
+
+//' Enable internal logging
+//'
+//' Log entries, depending on the log level, will be printed to the standard
+//' error stream.
+//'
+//' @param log_level A character value, one of "WARN", "INFO", "DEBUG", "VERB",
+//' or "NONE".
+//'
+//' @keywords internal
+// [[Rcpp::export]]
+void init_logging(const std::string& log_level) {
+ plog::init_r(log_level);
+}
diff --git a/src/api.cpp b/src/api.cpp
index 44d2c9a..1c9e4b0 100644
--- a/src/api.cpp
+++ b/src/api.cpp
@@ -1,350 +1,164 @@
-#include <dplyr.h>
+#include "pch.h"
+#include <dplyr/main.h>
-namespace dplyr{
+#include <boost/scoped_ptr.hpp>
- DataFrameVisitors::DataFrameVisitors( const Rcpp::DataFrame& data_) :
- data(data_),
- visitors(),
- visitor_names(data.names()),
- nvisitors(visitor_names.size())
- {
+#include <tools/hash.h>
+#include <tools/match.h>
- for( int i=0; i<nvisitors; i++){
- VectorVisitor* v = visitor( data[i] ) ;
- visitors.push_back(v) ;
- }
- }
-
- DataFrameVisitors::DataFrameVisitors( const Rcpp::DataFrame& data_, const Rcpp::CharacterVector& names ) :
- data(data_),
- visitors(),
- visitor_names(names),
- nvisitors(visitor_names.size())
- {
-
- std::string name ;
- int n = names.size() ;
- IntegerVector indices = r_match( names, RCPP_GET_NAMES(data) ) ;
-
- for( int i=0; i<n; i++){
- if( indices[i] == NA_INTEGER){
- name = (String)names[i] ;
- stop( "unknown column '%s' ", name ) ;
- }
- SEXP column = data[indices[i]-1];
- visitors.push_back(visitor( column )) ;
- }
+#include <dplyr/CharacterVectorOrderer.h>
- }
+#include <dplyr/tbl_cpp.h>
+#include <dplyr/visitor_impl.h>
- void DataFrameVisitors::structure( List& x, int nrows, CharacterVector classes ) const {
- x.attr( "class" ) = classes ;
- set_rownames(x, nrows) ;
- x.names() = visitor_names ;
- SEXP vars = data.attr( "vars" ) ;
- if( !Rf_isNull(vars) )
- x.attr( "vars" ) = vars ;
- }
+#include <dplyr/JoinVisitor.h>
- inline String comma_collapse( SEXP names ){
- return Language( "paste", names, _["collapse"] = ", " ).fast_eval() ;
- }
+#include <dplyr/Result/Result.h>
- DataFrameJoinVisitors::DataFrameJoinVisitors(const Rcpp::DataFrame& left_, const Rcpp::DataFrame& right_, Rcpp::CharacterVector names_left, Rcpp::CharacterVector names_right, bool warn_ ) :
- left(left_), right(right_),
- visitor_names_left(names_left),
- visitor_names_right(names_right),
- nvisitors(names_left.size()),
- visitors(nvisitors),
- warn(warn_)
- {
- std::string name_left, name_right ;
-
- IntegerVector indices_left = r_match( names_left, RCPP_GET_NAMES(left) ) ;
- IntegerVector indices_right = r_match( names_right, RCPP_GET_NAMES(right) ) ;
-
- for( int i=0; i<nvisitors; i++){
- name_left = names_left[i] ;
- name_right = names_right[i] ;
-
- if( indices_left[i] == NA_INTEGER ){
- stop( "'%s' column not found in lhs, cannot join", name_left ) ;
- }
- if( indices_right[i] == NA_INTEGER ){
- stop( "'%s' column not found in rhs, cannot join", name_right ) ;
- }
-
- visitors[i] = join_visitor( left[indices_left[i]-1], right[indices_right[i]-1], name_left, name_right, warn ) ;
- }
- }
+#include <dplyr/DataFrameJoinVisitors.h>
- Symbol extract_column( SEXP arg, const Environment& env ){
- RObject value ;
- if( TYPEOF(arg) == LANGSXP && CAR(arg) == Rf_install("~") ){
- if( Rf_length(arg) != 2 || TYPEOF(CADR(arg)) != SYMSXP )
- stop( "unhandled formula in column" ) ;
- value = CharacterVector::create( PRINTNAME(CADR(arg)) ) ;
- } else {
- value = Rcpp_eval(arg, env) ;
- }
- if( is<Symbol>(value) ){
- value = CharacterVector::create(PRINTNAME(value)) ;
- }
- if( !is<String>(value) ){
- stop("column must return a single string") ;
- }
- Symbol res(STRING_ELT(value,0)) ;
- return res ;
- }
-
- Symbol get_column(SEXP arg, const Environment& env, const LazySubsets& subsets ){
- Symbol res = extract_column(arg, env) ;
- if( !subsets.count(res) ){
- stop("result of column() expands to a symbol that is not a variable from the data: %s", CHAR(PRINTNAME(res)) ) ;
- }
- return res ;
- }
-
- void CallProxy::set_call( SEXP call_ ){
- proxies.clear() ;
- call = call_ ;
- if( TYPEOF(call) == LANGSXP ) traverse_call(call) ;
- }
+#include <dplyr/bad.h>
- SEXP CallProxy::eval(){
- if( TYPEOF(call) == LANGSXP ){
-
- if( can_simplify(call) ){
- SlicingIndex indices(0,subsets.nrows()) ;
- while(simplified(indices)) ;
- set_call(call) ;
- }
-
- int n = proxies.size() ;
- for( int i=0; i<n; i++){
- proxies[i].set( subsets[proxies[i].symbol] ) ;
- }
- return call.eval(env) ;
- } else if( TYPEOF(call) == SYMSXP) {
- // SYMSXP
- if( subsets.count(call) ) return subsets.get_variable(call) ;
- return call.eval(env) ;
- }
- return call ;
- }
+namespace dplyr {
- bool CallProxy::simplified(const SlicingIndex& indices){
- // initial
- if( TYPEOF(call) == LANGSXP ){
- boost::scoped_ptr<Result> res( get_handler(call, subsets, env) );
+DataFrameVisitors::DataFrameVisitors(const Rcpp::DataFrame& data_) :
+ data(data_),
+ visitors(),
+ visitor_names(data.names()),
+ nvisitors(visitor_names.size())
+{
- if( res ){
- // replace the call by the result of process
- call = res->process(indices) ;
+ for (int i = 0; i < nvisitors; i++) {
+ VectorVisitor* v = visitor(data[i]);
+ visitors.push_back(v);
+ }
+}
- // no need to go any further, we simplified the top level
- return true ;
- }
+DataFrameVisitors::DataFrameVisitors(const DataFrame& data_, const SymbolVector& names) :
+ data(data_),
+ visitors(),
+ visitor_names(names),
+ nvisitors(visitor_names.size())
+{
- return replace( CDR(call), indices ) ;
+ int n = names.size();
+ IntegerVector indices = names.match_in_table(data.names());
- }
- return false ;
+ for (int i = 0; i < n; i++) {
+ if (indices[i] == NA_INTEGER) {
+ bad_col(names[i], "is unknown");
}
+ SEXP column = data[indices[i] - 1];
+ visitors.push_back(visitor(column));
+ }
- bool CallProxy::replace( SEXP p, const SlicingIndex& indices ){
-
- SEXP obj = CAR(p) ;
-
- if( TYPEOF(obj) == LANGSXP ){
- boost::scoped_ptr<Result> res( get_handler(obj, subsets, env) );
- if(res){
- SETCAR(p, res->process(indices) ) ;
- return true ;
- }
-
- if( replace( CDR(obj), indices ) ) return true ;
- }
+}
- if( TYPEOF(p) == LISTSXP ){
- return replace( CDR(p), indices ) ;
- }
+void DataFrameVisitors::structure(List& x, int nrows, CharacterVector classes) const {
+ set_class(x, classes);
+ set_rownames(x, nrows);
+ x.names() = visitor_names;
+ copy_vars(x, data);
+}
- return false ;
+DataFrameJoinVisitors::DataFrameJoinVisitors(const DataFrame& left_, const DataFrame& right_, const SymbolVector& names_left, const SymbolVector& names_right, bool warn_, bool na_match) :
+ left(left_), right(right_),
+ visitor_names_left(names_left),
+ visitor_names_right(names_right),
+ nvisitors(names_left.size()),
+ visitors(nvisitors),
+ warn(warn_)
+{
+ IntegerVector indices_left = names_left.match_in_table(RCPP_GET_NAMES(left));
+ IntegerVector indices_right = names_right.match_in_table(RCPP_GET_NAMES(right));
+
+ for (int i = 0; i < nvisitors; i++) {
+ const SymbolString& name_left = names_left[i];
+ const SymbolString& name_right = names_right[i];
+
+ if (indices_left[i] == NA_INTEGER) {
+ stop("'%s' column not found in lhs, cannot join", name_left.get_utf8_cstring());
}
-
- void CallProxy::traverse_call( SEXP obj ){
-
- if( TYPEOF(obj) == LANGSXP && CAR(obj) == Rf_install("local") ) return ;
-
- if( TYPEOF(obj) == LANGSXP && CAR(obj) == Rf_install("global") ){
- SEXP symb = CADR(obj) ;
- if( TYPEOF(symb) != SYMSXP ) stop( "global only handles symbols" ) ;
- SEXP res = env.find(CHAR(PRINTNAME(symb))) ;
- call = res ;
- return ;
- }
-
- if( TYPEOF(obj) == LANGSXP && CAR(obj) == Rf_install("column") ){
- call = get_column(CADR(obj), env, subsets) ;
- return ;
- }
-
- if( ! Rf_isNull(obj) ){
- SEXP head = CAR(obj) ;
- switch( TYPEOF( head ) ){
- case LANGSXP:
- if( CAR(head) == Rf_install("global") ){
- SEXP symb = CADR(head) ;
- if( TYPEOF(symb) != SYMSXP ) stop( "global only handles symbols" ) ;
- SEXP res = env.find( CHAR(PRINTNAME(symb)) ) ;
-
- SETCAR(obj, res) ;
- SET_TYPEOF(obj, LISTSXP) ;
-
- break ;
- }
- if( CAR(head) == Rf_install("column")){
- Symbol column = get_column( CADR(head), env, subsets) ;
- SETCAR(obj, column ) ;
- head = CAR(obj) ;
- proxies.push_back( CallElementProxy( head, obj ) );
-
- break ;
- }
- if( CAR(head) == Rf_install("~")) break ;
- if( CAR(head) == Rf_install("order_by") ) break ;
- if( CAR(head) == Rf_install("function") ) break ;
- if( CAR(head) == Rf_install("local") ) return ;
- if( CAR(head) == Rf_install("<-") ){
- stop( "assignments are forbidden" ) ;
- }
- if( Rf_length(head) == 3 ){
- SEXP symb = CAR(head) ;
- if( symb == R_DollarSymbol || symb == Rf_install("@") || symb == Rf_install("::") || symb == Rf_install(":::") ){
-
- // Rprintf( "CADR(obj) = " ) ;
- // Rf_PrintValue( CADR(obj) ) ;
-
- // for things like : foo( bar = bling )$bla
- // so that `foo( bar = bling )` gets processed
- if( TYPEOF(CADR(head)) == LANGSXP ){
- traverse_call( CDR(head) ) ;
- }
-
- // deal with foo$bar( bla = boom )
- if( TYPEOF(CADDR(head)) == LANGSXP ){
- traverse_call( CDDR(head) ) ;
- }
-
- break ;
- } else {
- traverse_call( CDR(head) ) ;
- }
- } else {
- traverse_call( CDR(head) ) ;
- }
-
- break ;
- case LISTSXP:
- traverse_call( head ) ;
- traverse_call( CDR(head) ) ;
- break ;
- case SYMSXP:
- if( TYPEOF(obj) != LANGSXP ){
- if( ! subsets.count(head) ){
- if( head == R_MissingArg ) break ;
- if( head == Rf_install(".") ) break ;
-
- // in the Environment -> resolve
- try{
- Shield<SEXP> x( env.find( CHAR(PRINTNAME(head)) ) ) ;
- SETCAR( obj, x );
- } catch( ...){
- // what happens when not found in environment
- }
-
- } else {
- // in the data frame
- proxies.push_back( CallElementProxy( head, obj ) );
- }
- break ;
- }
- }
- traverse_call( CDR(obj) ) ;
- }
+ if (indices_right[i] == NA_INTEGER) {
+ stop("'%s' column not found in rhs, cannot join", name_right.get_utf8_cstring());
}
- CharacterVectorOrderer::CharacterVectorOrderer( const CharacterVector& data_ ) :
- data(data_),
- set(),
- orders(no_init(data.size()))
- {
- int n = data.size() ;
- if( n == 0 ) return ;
-
- // 1 - gather unique SEXP pointers from data
- SEXP* p_data = Rcpp::internal::r_vector_start<STRSXP>(data);
- SEXP previous = *p_data++ ;
- set.insert( previous ) ;
- for( int i=1; i<n; i++, p_data++){
- SEXP s = *p_data ;
-
- // we've just seen this string, keep going
- if( s == previous ) continue ;
-
- // is this string in the set already
- set.insert(s) ;
- previous = s ;
- }
-
- // retrieve unique strings from the set
- int n_uniques = set.size() ;
- CharacterVector uniques( set.begin(), set.end() ) ;
- CharacterVector s_uniques = Language( "sort", uniques ).fast_eval() ;
-
- // order the uniques with a callback to R
- IntegerVector o = r_match(uniques, s_uniques ) ;
-
- // combine uniques and o into a hash map for fast retrieval
- dplyr_hash_map<SEXP,int> map ;
- for( int i=0; i<n_uniques; i++){
- map.insert( std::make_pair(uniques[i], o[i] ) ) ;
- }
-
- // grab min ranks
- p_data = Rcpp::internal::r_vector_start<STRSXP>(data);
- previous = *p_data++ ;
-
- int o_pos ;
- orders[0] = o_pos = map.find(previous)->second ;
-
- for( int i=1; i<n; i++, p_data++){
- SEXP s = *p_data;
- if( s == previous ) {
- orders[i] = o_pos ;
- continue ;
- }
- previous = s ;
- orders[i] = o_pos = map.find(s)->second ;
- }
+ visitors[i] =
+ join_visitor(
+ Column(left[indices_left[i] - 1], name_left),
+ Column(right[indices_right[i] - 1], name_right),
+ warn, na_match
+ );
+ }
+}
+CharacterVectorOrderer::CharacterVectorOrderer(const CharacterVector& data) :
+ orders(no_init(data.size()))
+{
+ int n = data.size();
+ if (n == 0) return;
+
+ dplyr_hash_set<SEXP> set(n);
+
+ // 1 - gather unique SEXP pointers from data
+ SEXP* p_data = Rcpp::internal::r_vector_start<STRSXP>(data);
+ SEXP previous = *p_data++;
+ set.insert(previous);
+ for (int i = 1; i < n; i++, p_data++) {
+ SEXP s = *p_data;
+
+ // we've just seen this string, keep going
+ if (s == previous) continue;
+
+ // is this string in the set already
+ set.insert(s);
+ previous = s;
+ }
+
+ // retrieve unique strings from the set
+ int n_uniques = set.size();
+ LOG_VERBOSE << "Sorting " << n_uniques << " unique character elements";
+
+ CharacterVector uniques(set.begin(), set.end());
+ CharacterVector s_uniques = Language("sort", uniques).fast_eval();
+
+ // order the uniques with a callback to R
+ IntegerVector o = r_match(uniques, s_uniques);
+
+ // combine uniques and o into a hash map for fast retrieval
+ dplyr_hash_map<SEXP, int> map(n_uniques);
+ for (int i = 0; i < n_uniques; i++) {
+ map.insert(std::make_pair(uniques[i], o[i]));
+ }
+
+ // grab min ranks
+ p_data = Rcpp::internal::r_vector_start<STRSXP>(data);
+ previous = *p_data++;
+
+ int o_pos;
+ orders[0] = o_pos = map.find(previous)->second;
+
+ for (int i = 1; i < n; ++i, ++p_data) {
+ SEXP s = *p_data;
+ if (s == previous) {
+ orders[i] = o_pos;
+ continue;
}
+ previous = s;
+ orders[i] = o_pos = map.find(s)->second;
+ }
- CharacterVector get_uniques( const CharacterVector& left, const CharacterVector& right){
- int nleft = left.size(), nright = right.size() ;
- int n = nleft + nright ;
+}
- CharacterVector big = no_init(n) ;
- CharacterVector::iterator it = big.begin() ;
- std::copy( left.begin(), left.end(), it ) ;
- std::copy( right.begin(), right.end(), it + nleft ) ;
- return Language( "unique", big ).fast_eval() ;
- }
+CharacterVector get_uniques(const CharacterVector& left, const CharacterVector& right) {
+ int nleft = left.size(), nright = right.size();
+ int n = nleft + nright;
+ CharacterVector big = no_init(n);
+ CharacterVector::iterator it = big.begin();
+ std::copy(left.begin(), left.end(), it);
+ std::copy(right.begin(), right.end(), it + nleft);
+ return Language("unique", big).fast_eval();
}
-// [[Rcpp::export]]
-IntegerVector rank_strings( CharacterVector s ){
- return dplyr::CharacterVectorOrderer(s).get() ;
}
diff --git a/src/arrange.cpp b/src/arrange.cpp
index bf67548..bc8295a 100644
--- a/src/arrange.cpp
+++ b/src/arrange.cpp
@@ -1,65 +1,86 @@
-#include <dplyr.h>
+#include "pch.h"
+#include <dplyr/main.h>
-using namespace Rcpp ;
-using namespace dplyr ;
+#include <tools/Quosure.h>
+
+#include <dplyr/white_list.h>
+
+#include <dplyr/GroupedDataFrame.h>
+
+#include <dplyr/Order.h>
+
+#include <dplyr/Result/CallProxy.h>
+
+#include <dplyr/Groups.h>
+#include <dplyr/bad.h>
+
+using namespace Rcpp;
+using namespace dplyr;
// [[Rcpp::export]]
-List arrange_impl( DataFrame data, LazyDots dots ){
- if( data.size() == 0 ) return data ;
- check_valid_colnames(data) ;
- assert_all_white_list(data) ;
-
- if( dots.size() == 0 || data.nrows() == 0) return data ;
-
- int nargs = dots.size() ;
- List variables(nargs) ;
- LogicalVector ascending(nargs) ;
-
- for(int i=0; i<nargs; i++){
- const Lazy& lazy = dots[i] ;
-
- Shield<SEXP> call_( lazy.expr() ) ;
- SEXP call = call_ ;
- bool is_desc = TYPEOF(call) == LANGSXP && Rf_install("desc") == CAR(call) ;
-
- CallProxy call_proxy(is_desc ? CADR(call) : call, data, lazy.env()) ;
-
- Shield<SEXP> v(call_proxy.eval()) ;
- if( !white_list(v) ){
- stop( "cannot arrange column of class '%s'", get_single_class(v) ) ;
- }
-
- if( Rf_inherits(v, "data.frame" ) ){
- DataFrame df(v) ;
- int nr = df.nrows() ;
- if( nr != data.nrows() ){
- stop( "data frame column with incompatible number of rows (%d), expecting : %d", nr, data.nrows() );
- }
- } else if( Rf_isMatrix(v) ) {
- stop( "can't arrange by a matrix" ) ;
- } else {
- if( Rf_length(v) != data.nrows() ){
- stop( "incorrect size (%d), expecting : %d", Rf_length(v), data.nrows() ) ;
- }
- }
- variables[i] = v ;
- ascending[i] = !is_desc ;
+List arrange_impl(DataFrame data, QuosureList quosures) {
+ if (data.size() == 0 || data.nrows() == 0)
+ return data;
+
+ int nargs = quosures.size();
+ if (nargs == 0)
+ return data;
+
+ check_valid_colnames(data);
+ assert_all_white_list(data);
+
+ List variables(nargs);
+ LogicalVector ascending(nargs);
+
+ for (int i = 0; i < nargs; i++) {
+ const NamedQuosure& quosure = quosures[i];
+
+ Shield<SEXP> call_(quosure.expr());
+ SEXP call = call_;
+ bool is_desc = TYPEOF(call) == LANGSXP && Rf_install("desc") == CAR(call);
+
+ CallProxy call_proxy(is_desc ? CADR(call) : call, data, quosure.env());
+
+ Shield<SEXP> v(call_proxy.eval());
+ if (!white_list(v)) {
+ stop("cannot arrange column of class '%s' at position %d", get_single_class(v), i + 1);
}
- OrderVisitors o(variables, ascending, nargs) ;
- IntegerVector index = o.apply() ;
-
- DataFrameSubsetVisitors visitors( data, data.names() ) ;
- List res = visitors.subset(index, data.attr("class") ) ;
-
- if( is<GroupedDataFrame>(data) ){
- // so that all attributes are recalculated (indices ... )
- // see the lazyness feature in GroupedDataFrame
- // if we don't do that, we get the values of the un-arranged data
- // set for free from subset (#1064)
- res.attr("labels") = R_NilValue ;
- res.attr( "vars" ) = data.attr("vars" ) ;
- return GroupedDataFrame(res).data() ;
+
+ if (Rf_inherits(v, "data.frame")) {
+ DataFrame df(v);
+ int nr = df.nrows();
+ if (nr != data.nrows()) {
+ stop("data frame column with incompatible number of rows (%d), expecting : %d", nr, data.nrows());
+ }
+ } else if (Rf_isMatrix(v)) {
+ bad_pos_arg(i + 1, "is of unsupported type matrix");
+ } else {
+ if (Rf_length(v) != data.nrows()) {
+ stop("incorrect size (%d) at position %d, expecting : %d", Rf_length(v), i + 1, data.nrows());
+ }
}
+ variables[i] = v;
+ ascending[i] = !is_desc;
+ }
+ variables.names() = quosures.names();
+
+ OrderVisitors o(variables, ascending, nargs);
+ IntegerVector index = o.apply();
+
+ DataFrameSubsetVisitors visitors(data, data.names());
+ List res = visitors.subset(index, get_class(data));
+
+ if (is<GroupedDataFrame>(data)) {
+ // so that all attributes are recalculated (indices ... )
+ // see the lazyness feature in GroupedDataFrame
+ // if we don't do that, we get the values of the un-arranged data
+ // set for free from subset (#1064)
+ res.attr("labels") = R_NilValue;
+ copy_vars(res, data);
+ return GroupedDataFrame(res).data();
+ }
+ else {
SET_ATTRIB(res, strip_group_attributes(res));
- return res ;
+ return res;
+ }
}
diff --git a/src/between.cpp b/src/between.cpp
index 1b29c06..b6e0202 100644
--- a/src/between.cpp
+++ b/src/between.cpp
@@ -1,9 +1,10 @@
+#include "pch.h"
#include <Rcpp.h>
using namespace Rcpp;
//' Do values in a numeric vector fall in specified range?
//'
-//' This is a shortcut for \code{x >= left & x <= right}, implemented
+//' This is a shortcut for `x >= left & x <= right`, implemented
//' efficiently in C++ for local values, and translated to the
//' appropriate SQL for remote tables.
//'
@@ -18,10 +19,22 @@ LogicalVector between(NumericVector x, double left, double right) {
int n = x.size();
LogicalVector out = no_init(n);
+ // Assume users know what they're doing with date/times. In the future
+ // should ensure that left and right are the correct class too.
+ if (x.attr("class") != R_NilValue && !Rf_inherits(x, "Date") && !Rf_inherits(x, "POSIXct")) {
+ warningcall(R_NilValue, "between() called on numeric vector with S3 class");
+ }
+
+ if (NumericVector::is_na(left) || NumericVector::is_na(right)) {
+ for (int i = 0; i < n; ++i)
+ out[i] = NA_LOGICAL;
+ return out;
+ }
+
for (int i = 0; i < n; ++i) {
if (NumericVector::is_na(x[i])) {
- out[i] = NA_REAL;
- } else if ( (x[i] >= left) && (x[i] <= right) ) {
+ out[i] = NA_LOGICAL;
+ } else if ((x[i] >= left) && (x[i] <= right)) {
out[i] = true;
} else {
out[i] = false;
@@ -35,7 +48,7 @@ LogicalVector between(NumericVector x, double left, double right) {
library(microbenchmark)
-betweenr <- function(x, left, right) {
+betweenr <- function(x, left, right){
x >= left & x <= right
}
diff --git a/src/bind.cpp b/src/bind.cpp
index 8d00a90..902f2eb 100644
--- a/src/bind.cpp
+++ b/src/bind.cpp
@@ -1,285 +1,440 @@
-#include <dplyr.h>
+#include "pch.h"
+#include <dplyr/main.h>
-using namespace Rcpp ;
-using namespace dplyr ;
+#include <boost/scoped_ptr.hpp>
-class DataFrameAbleVector {
-public:
+#include <tools/all_na.h>
+#include <tools/collapse.h>
+#include <tools/pointer_vector.h>
+#include <tools/utils.h>
- DataFrameAbleVector() : data(){}
+#include <dplyr/GroupedDataFrame.h>
+#include <dplyr/Collecter.h>
+#include <dplyr/bad.h>
- inline void push_back( SEXP x) {
- data.push_back( DataFrameAble(x) ) ;
+using namespace Rcpp;
+using namespace dplyr;
+
+
+// From Rcpp::DataFrame
+static
+int df_rows_length(SEXP df) {
+ SEXP n = R_NilValue;
+ SEXP attrs = ATTRIB(df);
+ while (attrs != R_NilValue) {
+ if (TAG(attrs) == R_RowNamesSymbol) {
+ n = CAR(attrs) ;
+ break ;
+ }
+ attrs = CDR(attrs) ;
}
- inline const DataFrameAble& operator[]( int i) const {
- return data[i] ;
+ if (n == R_NilValue)
+ return 0;
+ else if (TYPEOF(n) == INTSXP && LENGTH(n) == 2 && INTEGER(n)[0] == NA_INTEGER)
+ return abs(INTEGER(n)[1]);
+ else
+ return LENGTH(n);
+}
+
+static
+R_xlen_t rows_length(SEXP x, bool rowwise) {
+ if (TYPEOF(x) == VECSXP) {
+ if (Rf_inherits(x, "data.frame"))
+ return df_rows_length(x);
+ else if (Rf_xlength(x) > 0)
+ return Rf_xlength(VECTOR_ELT(x, 0));
+ else
+ return 0;
+ } else {
+ if (rowwise)
+ return 1;
+ else
+ return Rf_xlength(x);
}
+}
+static
+R_xlen_t cols_length(SEXP x) {
+ if (TYPEOF(x) == VECSXP)
+ return Rf_xlength(x);
+ else
+ return 1;
+}
+
+static
+void inner_vector_check(SEXP x, int nrows, int arg) {
+ if (!is_vector(x))
+ bad_pos_arg(arg + 1, "is a list, must contain atomic vectors");
- inline int size() const {
- return data.size() ;
+ if (OBJECT(x)) {
+ if (Rf_inherits(x, "data.frame"))
+ bad_pos_arg(arg + 1, "can't be a list containing data frames");
+ if (Rf_inherits(x, "POSIXlt"))
+ bad_pos_arg(arg + 1, "can't be a list containing POSIXlt values");
}
- ~DataFrameAbleVector(){
- while (data.size()) data.pop_back();
+ if (Rf_length(x) != nrows) {
+ bad_pos_arg(arg + 1, "must be length {expected_size}, not {actual_size}",
+ _["expected_size"] = nrows, _["actual_size"] = Rf_length(x));
}
+}
-private:
- std::vector<DataFrameAble> data ;
-} ;
-
-template <typename Dots>
-List rbind__impl( Dots dots, SEXP id = R_NilValue ){
- int ndata = dots.size() ;
- int n = 0 ;
- DataFrameAbleVector chunks ;
- std::vector<int> df_nrows ;
-
- int k=0 ;
- for( int i=0; i<ndata; i++) {
- SEXP obj = dots[i] ;
- if( Rf_isNull(obj) ) continue ;
- chunks.push_back( obj ) ;
- int nrows = chunks[k].nrows() ;
- df_nrows.push_back(nrows) ;
- n += nrows ;
- k++ ;
- }
- ndata = chunks.size() ;
- pointer_vector<Collecter> columns ;
-
- std::vector<String> names ;
-
- k=0 ;
- Function enc2native( "enc2native" ) ;
- for( int i=0; i<ndata; i++){
- Rcpp::checkUserInterrupt() ;
-
- const DataFrameAble& df = chunks[i] ;
- if( !df.size() ) continue ;
-
- int nrows = df.nrows() ;
-
- CharacterVector df_names = enc2native(df.names()) ;
- for( int j=0; j<df.size(); j++){
- SEXP source = df.get(j) ;
- String name = df_names[j] ;
-
- Collecter* coll = 0;
- size_t index = 0 ;
- for( ; index < names.size(); index++){
- if( name == names[index] ){
- coll = columns[index] ;
- break ;
- }
- }
- if( ! coll ){
- coll = collecter( source, n ) ;
- columns.push_back( coll );
- names.push_back(name) ;
- }
- if( coll->compatible(source) ){
- // if the current source is compatible, collect
- coll->collect( SlicingIndex( k, nrows), source ) ;
-
- } else if( coll->can_promote(source) ) {
- // setup a new Collecter
- Collecter* new_collecter = promote_collecter(source, n, coll ) ;
-
- // import data from this chunk
- new_collecter->collect( SlicingIndex( k, nrows), source ) ;
-
- // import data from previous collecter
- new_collecter->collect( SlicingIndex(0, k), coll->get() ) ;
-
- // dispose the previous collecter and keep the new one.
- delete coll ;
- columns[index] = new_collecter ;
-
- } else if( all_na(source) ) {
- // do nothing, the collecter already initialized data with the
- // right NA
- } else if( coll->is_logical_all_na() ) {
- Collecter* new_collecter = collecter( source, n ) ;
- new_collecter->collect( SlicingIndex(k, nrows), source ) ;
- delete coll ;
- columns[index] = new_collecter ;
- } else {
- std::string column_name(name) ;
- stop(
- "Can not automatically convert from %s to %s in column \"%s\".",
- coll->describe(), get_single_class(source), column_name
- ) ;
- }
+static
+bool is_non_data_frame_object(SEXP x) {
+ if (TYPEOF(x) != VECSXP) return false;
+ if (!OBJECT(x)) return false;
+ return !Rf_inherits(x, "data.frame");
+}
- }
+static
+void rbind_vector_check(SEXP x, R_xlen_t nrows, int arg) {
+ if (!is_vector(x) || is_non_data_frame_object(x)) {
+ bad_pos_arg(arg + 1, "must be a data frame or a named atomic vector, not a {type}",
+ _["type"] = get_single_class(x));
+ }
+
+ if (rows_length(x, true) != nrows) {
+ bad_pos_arg(arg + 1, "must be length {expected_size}, not {actual_size}",
+ _["expected_size"] = rows_length(x, true), _["actual_size"] = nrows);
+ }
+
+ if (vec_names(x) == R_NilValue) {
+ bad_pos_arg(arg + 1, "must have names");
+ }
+}
+static
+void cbind_vector_check(SEXP x, R_xlen_t nrows, SEXP contr, int arg) {
+ if (is_atomic(x) && !has_name_at(contr, arg))
+ bad_pos_arg(arg + 1, "must have names");
+ if (rows_length(x, false) != nrows) {
+ bad_pos_arg(arg + 1, "must be length {expected_size}, not {actual_size}",
+ _["expected_size"] = rows_length(x, true), _["actual_size"] = nrows);
+ }
+}
+
+static
+void rbind_type_check(SEXP x, int nrows, int arg) {
+ int n = Rf_length(x);
+ if (n == 0)
+ return;
- k += nrows ;
+ rbind_vector_check(x, nrows, arg);
+
+ if (TYPEOF(x) == VECSXP) {
+ for (int i = 0; i < n; i++)
+ inner_vector_check(VECTOR_ELT(x, i), nrows, i);
+ }
+}
+static
+void cbind_type_check(SEXP x, int nrows, SEXP contr, int arg) {
+ int n = Rf_length(x);
+ if (n == 0)
+ return;
+
+ cbind_vector_check(x, nrows, contr, arg);
+
+ if (TYPEOF(x) == VECSXP) {
+ if (OBJECT(x) && !Rf_inherits(x, "data.frame")) {
+ bad_pos_arg(arg + 1, "must be a data frame or a named atomic vector, not a {type}",
+ _["type"] = get_single_class(x));
}
+ for (int i = 0; i < n; i++)
+ inner_vector_check(VECTOR_ELT(x, i), nrows, i);
+ }
+}
+
+extern "C"
+bool dplyr_is_bind_spliceable(SEXP x) {
+ if (TYPEOF(x) != VECSXP)
+ return false;
- int nc = columns.size() ;
- int has_id = Rf_isNull(id) ? 0 : 1;
+ if (OBJECT(x))
+ return Rf_inherits(x, "spliced");
- List out(nc + has_id) ;
- CharacterVector out_names(nc + has_id) ;
- for( int i=0; i<nc; i++){
- out[i + has_id] = columns[i]->get() ;
- out_names[i + has_id] = names[i] ;
+ for (R_xlen_t i = 0; i != Rf_xlength(x); ++i) {
+ if (is_atomic(VECTOR_ELT(x, i)))
+ return false;
+ }
+
+ return true;
+}
+
+// [[Rcpp::export]]
+SEXP flatten_bindable(SEXP x) {
+ // FIXME: This is temporary and should be replaced with rlang::flatten_if()
+ typedef bool(*is_spliceable_t)(SEXP);
+ typedef SEXP(*rlang_squash_if_t)(SEXP, SEXPTYPE, is_spliceable_t, int);
+
+ static rlang_squash_if_t rlang_squash_if = (rlang_squash_if_t)R_GetCCallable("rlang", "rlang_squash_if");
+
+ return rlang_squash_if(x, VECSXP, &dplyr_is_bind_spliceable, 1);
+}
+
+List rbind__impl(List dots, const SymbolString& id) {
+ int ndata = dots.size();
+ R_xlen_t n = 0;
+ std::vector<SEXP> chunks;
+ std::vector<R_xlen_t> df_nrows;
+ std::vector<String> dots_names;
+
+ chunks.reserve(ndata);
+ df_nrows.reserve(ndata);
+ dots_names.reserve(ndata);
+
+ int k = 0;
+ for (int i = 0; i < ndata; i++) {
+ SEXP obj = dots[i];
+ if (Rf_isNull(obj)) continue;
+ chunks.push_back(obj);
+ R_xlen_t nrows = rows_length(chunks[k], true);
+ df_nrows.push_back(nrows);
+ n += nrows;
+ if (!id.is_empty()) {
+ dots_names.push_back(name_at(dots, i));
}
+ k++;
+ }
+ ndata = chunks.size();
+ pointer_vector<Collecter> columns;
- // Add vector of identifiers if .id is supplied
- if (!Rf_isNull(id)) {
- CharacterVector df_names = dots.names() ;
- CharacterVector id_col = no_init(n) ;
+ SymbolVector names;
- CharacterVector::iterator it = id_col.begin() ;
- for (int i=0; i<ndata; ++i) {
- std::fill( it, it + df_nrows[i], df_names[i] ) ;
- it += df_nrows[i] ;
+ k = 0;
+ for (int i = 0; i < ndata; i++) {
+ Rcpp::checkUserInterrupt();
+
+ SEXP df = chunks[i];
+ R_xlen_t nrows = df_nrows[i];
+ rbind_type_check(df, nrows, i);
+
+ SymbolVector df_names(vec_names(df));
+ for (int j = 0; j < Rf_length(df); j++) {
+
+ SEXP source;
+ int offset;
+ if (TYPEOF(df) == VECSXP) {
+ source = VECTOR_ELT(df, j);
+ offset = 0;
+ } else {
+ source = df;
+ offset = j;
}
- out[0] = id_col ;
- out_names[0] = Rcpp::as<std::string>(id) ;
- }
- out.attr( "names" ) = out_names ;
- set_rownames( out, n ) ;
-
- // infer the classes and extra info (groups, etc ) from the first (#1692)
- if( ndata ){
- const DataFrameAble& first = chunks[0] ;
- if( first.is_dataframe() ){
- DataFrame df = first.get() ;
- out.attr("class") = df.attr("class") ;
- if( df.inherits("grouped_df") ){
- out.attr("vars") = df.attr("vars") ;
- out = GroupedDataFrame(out).data() ;
+
+ SymbolString name = df_names[j];
+
+ Collecter* coll = 0;
+ R_xlen_t index = 0;
+ for (; index < names.size(); index++) {
+ if (name == names[index]) {
+ coll = columns[index];
+ break;
}
+ }
+ if (!coll) {
+ coll = collecter(source, n);
+ columns.push_back(coll);
+ names.push_back(name);
+ }
+ if (coll->compatible(source)) {
+ // if the current source is compatible, collect
+ coll->collect(OffsetSlicingIndex(k, nrows), source, offset);
+ } else if (coll->can_promote(source)) {
+ // setup a new Collecter
+ Collecter* new_collecter = promote_collecter(source, n, coll);
+
+ // import data from this chunk
+ new_collecter->collect(OffsetSlicingIndex(k, nrows), source, offset);
+
+ // import data from previous collecter
+ new_collecter->collect(NaturalSlicingIndex(k), coll->get());
+
+ // dispose the previous collecter and keep the new one.
+ delete coll;
+ columns[index] = new_collecter;
+
+ } else if (all_na(source)) {
+ // do nothing, the collecter already initialized data with the
+ // right NA
+ } else if (coll->is_logical_all_na()) {
+ Collecter* new_collecter = collecter(source, n);
+ new_collecter->collect(OffsetSlicingIndex(k, nrows), source, offset);
+ delete coll;
+ columns[index] = new_collecter;
} else {
- out.attr( "class" ) = classes_not_grouped() ;
+ bad_col(SymbolString(name), "can't be converted from {source_type} to {target_type}",
+ _["source_type"] = coll->describe(), _["target_type"] = get_single_class(source));
+ }
+
+ }
+
+ k += nrows;
+ }
+
+ int nc = columns.size();
+ int has_id = id.is_empty() ? 0 : 1;
+
+ List out(no_init(nc + has_id));
+ SymbolVector out_names(no_init(nc + has_id));
+ for (int i = 0; i < nc; i++) {
+ out[i + has_id] = columns[i]->get();
+ out_names.set(i + has_id, names[i]);
+ }
+
+ // Add vector of identifiers if .id is supplied
+ if (!id.is_empty()) {
+ CharacterVector id_col = no_init(n);
+
+ CharacterVector::iterator it = id_col.begin();
+ for (int i = 0; i < ndata; ++i) {
+ std::fill(it, it + df_nrows[i], dots_names[i]);
+ it += df_nrows[i];
+ }
+ out[0] = id_col;
+ out_names.set(0, id);
+ }
+ out.attr("names") = out_names;
+ set_rownames(out, n);
+
+ // infer the classes and extra info (groups, etc ) from the first (#1692)
+ if (ndata) {
+ SEXP first = chunks[0];
+ if (Rf_inherits(first, "data.frame")) {
+ set_class(out, get_class(first));
+ if (Rf_inherits(first, "grouped_df")) {
+ copy_vars(out, first);
+ out = GroupedDataFrame(out).data();
}
} else {
- out.attr( "class" ) = classes_not_grouped() ;
+ set_class(out, classes_not_grouped());
}
+ } else {
+ set_class(out, classes_not_grouped());
+ }
- return out ;
+ return out;
}
// [[Rcpp::export]]
-List bind_rows_( List dots, SEXP id = R_NilValue ){
- return rbind__impl(dots, id) ;
+List bind_rows_(List dots, SEXP id) {
+ if (Rf_isNull(id))
+ return rbind__impl(dots, SymbolString());
+ else
+ return rbind__impl(dots, SymbolString(Rcpp::as<String>(id)));
}
// [[Rcpp::export]]
-List rbind_list__impl( Dots dots ){
- return rbind__impl(dots) ;
-}
+List cbind_all(List dots) {
+ int n_dots = dots.size();
+
+ // First check that the number of rows is the same based on first
+ // nonnull element
+ int first_i = -1;
+ for (int i = 0; i != n_dots; ++i) {
+ if (dots[i] != R_NilValue) {
+ first_i = i;
+ break;
+ }
+ }
-template <typename Dots>
-List cbind__impl( Dots dots ){
- int n = dots.size() ;
+ if (!n_dots || first_i == -1)
+ return DataFrame();
- DataFrameAbleVector chunks ;
- for( int i=0; i<n; i++) {
- SEXP obj = dots[i] ;
- if( !Rf_isNull(obj) )
- chunks.push_back( dots[i] );
- }
- n = chunks.size() ;
-
- // first check that the number of rows is the same
- const DataFrameAble& df = chunks[0] ;
- int nrows = df.nrows() ;
- int nv = df.size() ;
- for( int i=1; i<n; i++){
- const DataFrameAble& current = chunks[i] ;
- if( current.nrows() != nrows ){
- stop( "incompatible number of rows (%d, expecting %d)", current.nrows(), nrows ) ;
- }
- nv += current.size() ;
+ SEXP first = dots[first_i];
+ const R_xlen_t nrows = rows_length(first, false);
+ cbind_type_check(first, nrows, dots, 0);
+
+ R_xlen_t nv = cols_length(first);
+
+ for (int i = first_i + 1; i < n_dots; i++) {
+ SEXP current = dots[i];
+ if (Rf_isNull(current))
+ continue;
+
+ cbind_type_check(current, nrows, dots, i);
+ nv += cols_length(current);
}
// collect columns
- List out(nv) ;
- CharacterVector out_names(nv) ;
+ List out(nv);
+ CharacterVector out_names(nv);
+ SEXP dots_names = vec_names(dots);
// then do the subsequent dfs
- for( int i=0, k=0 ; i<n; i++){
- Rcpp::checkUserInterrupt() ;
-
- const DataFrameAble& current = chunks[i] ;
- CharacterVector current_names = current.names() ;
- int nc = current.size() ;
- for( int j=0; j<nc; j++, k++){
- out[k] = shared_SEXP(current.get(j)) ;
- out_names[k] = current_names[j] ;
+ for (int i = first_i, k = 0; i < n_dots; i++) {
+ SEXP current = dots[i];
+ if (Rf_isNull(current))
+ continue;
+
+ if (TYPEOF(current) == VECSXP) {
+ CharacterVector current_names = vec_names(current);
+ int nc = Rf_length(current);
+ for (int j = 0; j < nc; j++, k++) {
+ out[k] = shared_SEXP(VECTOR_ELT(current, j));
+ out_names[k] = current_names[j];
}
+ } else {
+ out[k] = current;
+ out_names[k] = STRING_ELT(dots_names, i);
+ k++;
+ }
+
+ Rcpp::checkUserInterrupt();
}
// infer the classes and extra info (groups, etc ) from the first (#1692)
- if( n ){
- const DataFrameAble& first = chunks[0] ;
- if( first.is_dataframe() ){
- DataFrame df = first.get() ;
- copy_most_attributes(out, df) ;
- } else {
- out.attr( "class" ) = classes_not_grouped() ;
- }
+ if (Rf_inherits(first, "data.frame")) {
+ copy_most_attributes(out, first);
} else {
- out.attr( "class" ) = classes_not_grouped() ;
+ set_class(out, classes_not_grouped());
}
- out.names() = out_names ;
- set_rownames( out, nrows ) ;
- return out ;
-}
+ out.names() = out_names;
+ set_rownames(out, nrows);
-// [[Rcpp::export]]
-List cbind_all( List dots ){
- return cbind__impl( dots ) ;
+ return out;
}
// [[Rcpp::export]]
-SEXP combine_all( List data ){
- int nv = data.size() ;
- if( nv == 0 ) stop("combine_all needs at least one vector") ;
-
- // get the size of the output
- int n = 0 ;
- for( int i=0; i<nv; i++){
- n += Rf_length(data[i]) ;
- }
+SEXP combine_all(List data) {
+ int nv = data.size();
+ if (nv == 0) stop("combine_all needs at least one vector");
+
+ // get the size of the output
+ int n = 0;
+ for (int i = 0; i < nv; i++) {
+ n += Rf_length(data[i]);
+ }
- // go to the first non NULL
- int i=0;
- for( ; i<nv; i++){
- if( !Rf_isNull(data[i]) ) break ;
- }
- if( i == nv) stop( "no data to combine, all elements are NULL" ) ;
-
- // collect
- boost::scoped_ptr<Collecter> coll( collecter( data[i], n ) ) ;
- int k = Rf_length(data[i]) ;
- coll->collect( SlicingIndex(0, k), data[i] ) ;
- i++;
- for(; i<nv; i++){
- SEXP current = data[i] ;
- if( Rf_isNull(current)) continue ;
- int n_current= Rf_length(current) ;
-
- if( coll->compatible(current) ){
- coll->collect( SlicingIndex(k, n_current), current ) ;
- } else if( coll->can_promote(current) ) {
- Collecter* new_coll = promote_collecter(current, n, coll.get() ) ;
- new_coll->collect( SlicingIndex(k, n_current), current ) ;
- new_coll->collect( SlicingIndex(0, k), coll->get() ) ;
- coll.reset( new_coll ) ;
- } else {
- stop(
- "Can not automatically convert from %s to %s.",
- get_single_class(coll->get()), get_single_class(current)
- ) ;
- }
- k += n_current ;
+ // go to the first non NULL
+ int i = 0;
+ for (; i < nv; i++) {
+ if (!Rf_isNull(data[i])) break;
+ }
+ if (i == nv) stop("no data to combine, all elements are NULL");
+
+ // collect
+ boost::scoped_ptr<Collecter> coll(collecter(data[i], n));
+ int k = Rf_length(data[i]);
+ coll->collect(NaturalSlicingIndex(k), data[i]);
+ i++;
+ for (; i < nv; i++) {
+ SEXP current = data[i];
+ if (Rf_isNull(current)) continue;
+ int n_current = Rf_length(current);
+
+ if (coll->compatible(current)) {
+ coll->collect(OffsetSlicingIndex(k, n_current), current);
+ } else if (coll->can_promote(current)) {
+ Collecter* new_coll = promote_collecter(current, n, coll.get());
+ new_coll->collect(OffsetSlicingIndex(k, n_current), current);
+ new_coll->collect(NaturalSlicingIndex(k), coll->get());
+ coll.reset(new_coll);
+ } else {
+ bad_pos_arg(i + 1, "can't be converted from {source_type} to {target_type}",
+ _["source_type"] = get_single_class(current), _["target_type"] = get_single_class(coll->get()));
}
+ k += n_current;
+ }
- return coll->get() ;
+ return coll->get();
}
diff --git a/src/combine_variables.cpp b/src/combine_variables.cpp
index 319435b..5159715 100644
--- a/src/combine_variables.cpp
+++ b/src/combine_variables.cpp
@@ -1,3 +1,4 @@
+#include "pch.h"
#include <Rcpp.h>
using namespace Rcpp;
@@ -37,7 +38,7 @@ class VarList {
}
public:
- VarList(int n) : out_indx(), out_name() {
+ explicit VarList(int n) : out_indx(), out_name() {
out_indx.reserve(n);
out_name.reserve(n);
}
@@ -85,9 +86,9 @@ SEXP combine_vars(CharacterVector vars, ListOf<IntegerVector> xs) {
SEXP raw_names = Rf_getAttrib(xs, Rf_mkString("names"));
CharacterVector xs_names;
if (raw_names == R_NilValue) {
- xs_names = CharacterVector(xs.size(), "" );
+ xs_names = CharacterVector(xs.size(), "");
} else {
- xs_names = raw_names ;
+ xs_names = raw_names;
}
// If first component is negative, pre-fill with existing vars
@@ -121,7 +122,7 @@ SEXP combine_vars(CharacterVector vars, ListOf<IntegerVector> xs) {
}
}
} else if (has_names) {
- CharacterVector names = x.names() ;
+ CharacterVector names = x.names();
for (int j = 0; j < x.size(); ++j) {
selected.update(x[j], names[j]);
}
diff --git a/src/distinct.cpp b/src/distinct.cpp
index efbdd06..dc14e6c 100644
--- a/src/distinct.cpp
+++ b/src/distinct.cpp
@@ -1,33 +1,56 @@
-#include <dplyr.h>
+#include "pch.h"
+#include <dplyr/main.h>
-using namespace Rcpp ;
-using namespace dplyr ;
+#include <dplyr/visitor_set/VisitorSetIndexSet.h>
+
+#include <dplyr/RowwiseDataFrame.h>
+#include <dplyr/MultipleVectorVisitors.h>
+#include <dplyr/DataFrameSubsetVisitors.h>
+#include <dplyr/Result/Count_Distinct.h>
+
+using namespace Rcpp;
+using namespace dplyr;
+
+SEXP select_not_grouped(const DataFrame& df, const SymbolVector& keep, const SymbolVector& new_names);
-SEXP select_not_grouped( const DataFrame& df, const CharacterVector& keep, CharacterVector new_names );
// [[Rcpp::export]]
-SEXP distinct_impl( DataFrame df, CharacterVector vars, CharacterVector keep){
- if( df.size() == 0 )
- return df ;
+SEXP distinct_impl(DataFrame df, const SymbolVector& vars, const SymbolVector& keep) {
+ if (df.size() == 0)
+ return df;
- // No vars means ungrouped data with keep_all = TRUE.
- if ( vars.size() == 0 )
- return df;
+ // No vars means ungrouped data with keep_all = TRUE.
+ if (vars.size() == 0)
+ return df;
- check_valid_colnames(df) ;
- if( !vars.size() ){
- vars = df.names() ;
- }
- DataFrameVisitors visitors(df, vars) ;
+ check_valid_colnames(df);
+ DataFrameVisitors visitors(df, vars);
- std::vector<int> indices ;
- VisitorSetIndexSet<DataFrameVisitors> set(visitors) ;
+ std::vector<int> indices;
+ VisitorSetIndexSet<DataFrameVisitors> set(visitors);
- int n = df.nrows() ;
- for( int i=0; i<n; i++){
- if( set.insert(i).second ){
- indices.push_back(i) ;
- }
+ int n = df.nrows();
+ for (int i = 0; i < n; i++) {
+ if (set.insert(i).second) {
+ indices.push_back(i);
}
+ }
+
+ return DataFrameSubsetVisitors(df, keep).subset(indices, get_class(df));
+}
+
+// [[Rcpp::export]]
+SEXP n_distinct_multi(List variables, bool na_rm = false) {
+ if (variables.length() == 0) {
+ stop("Need at least one column for `n_distinct()`");
+ }
- return DataFrameSubsetVisitors(df, keep).subset(indices, df.attr("class")) ;
+ MultipleVectorVisitors visitors(variables);
+ NaturalSlicingIndex everything(visitors.nrows());
+ if (na_rm) {
+ Count_Distinct_Narm<MultipleVectorVisitors> counter(visitors);
+ return counter.process(everything);
+ } else {
+ Count_Distinct<MultipleVectorVisitors> counter(visitors);
+ return counter.process(everything);
+ }
}
diff --git a/src/dplyr.cpp b/src/dplyr.cpp
deleted file mode 100644
index 45b9c59..0000000
--- a/src/dplyr.cpp
+++ /dev/null
@@ -1,1830 +0,0 @@
-#include <dplyr.h>
-
-using namespace Rcpp ;
-using namespace dplyr ;
-
-typedef dplyr_hash_map<SEXP,HybridHandler> HybridHandlerMap ;
-
-bool has_no_class( const RObject& arg) {
- return RCPP_GET_CLASS(arg) == R_NilValue ;
-}
-
-bool hybridable( RObject arg ){
- if( Rf_inherits(arg, "Date") || Rf_inherits(arg, "POSIXct") || Rf_inherits(arg, "difftime") ) return true ;
-
- if( arg.isObject() || arg.isS4() ) return false ;
- int type = arg.sexp_type() ;
- switch( type ){
- case INTSXP:
- case REALSXP:
- case LGLSXP:
- case STRSXP:
- case CPLXSXP:
- case RAWSXP:
- return has_no_class(arg) ;
- default: break ;
- }
- return false ;
-}
-
-template <template <int,bool> class Fun, bool narm>
-Result* simple_prototype_impl( SEXP arg, bool is_summary ){
- // if not hybridable, just let R handle it
- if( !hybridable(arg) ) return 0 ;
-
- switch( TYPEOF(arg) ){
- case INTSXP:
- return new Fun<INTSXP,narm>( arg, is_summary ) ;
- case REALSXP:
- return new Fun<REALSXP,narm>( arg, is_summary ) ;
- default: break ;
- }
- return 0 ;
-}
-
-template <template <int,bool> class Fun>
-Result* simple_prototype( SEXP call, const LazySubsets& subsets, int nargs ){
- if( nargs == 0 ) return 0 ;
- SEXP arg = CADR(call) ;
- bool is_summary = false ;
- if( TYPEOF(arg) == SYMSXP ){
- if( subsets.count(arg) ) {
- // we have a symbol from the data - great
- is_summary = subsets.is_summary(arg) ;
- arg = subsets.get_variable(arg) ;
- } else {
- // we have a symbol but we don't know about it, so we give up and let R evaluation handle it
- return 0 ;
- }
- } else {
- // anything else: expressions, constants ...
- // workaround for now : we just let R deal with it
- // of course this needs some specializations, i.e. sum(1) does not need R to get involved
- return 0 ;
- }
-
- if( nargs == 1 ){
- return simple_prototype_impl<Fun, false>( arg, is_summary ) ;
- } else if(nargs == 2 ){
- SEXP arg2 = CDDR(call) ;
- // we know how to handle fun( ., na.rm = TRUE/FALSE )
- if( TAG(arg2) == R_NaRmSymbol ){
- SEXP narm = CAR(arg2) ;
- if( TYPEOF(narm) == LGLSXP && LENGTH(narm) == 1 ){
- if( LOGICAL(narm)[0] == TRUE ){
- return simple_prototype_impl<Fun, true>( arg, is_summary ) ;
- } else {
- return simple_prototype_impl<Fun, false>( arg, is_summary ) ;
- }
- }
- }
- }
- return 0 ;
-}
-
-template< template <int, bool> class Tmpl, bool narm>
-Result* minmax_prototype_impl(SEXP arg, bool is_summary){
- if( !hybridable(arg) ) return 0 ;
-
- switch( TYPEOF(arg) ){
- case INTSXP:
- return new Tmpl<INTSXP,narm>( arg, is_summary ) ;
- case REALSXP:
- return new Tmpl<REALSXP,narm>( arg, is_summary ) ;
- default: break ;
- }
- return 0 ;
-}
-
-template< template <int, bool> class Tmpl>
-Result* minmax_prototype( SEXP call, const LazySubsets& subsets, int nargs ){
- using namespace dplyr ;
- // we only can handle 1 or two arguments
- if( nargs == 0 || nargs > 2 ) return 0 ;
-
- // the first argument is the data to operate on
- SEXP arg = CADR(call) ;
-
- bool is_summary = false ;
- if( TYPEOF(arg) == SYMSXP ){
- if( subsets.count(arg) ) {
- is_summary = subsets.is_summary(arg) ;
- arg = subsets.get_variable(arg) ;
- }
- else return 0 ;
- } else {
- return 0 ;
- }
-
- if( nargs == 1 ){
- return minmax_prototype_impl<Tmpl,false>(arg, is_summary) ;
- } else if( nargs == 2 ){
- SEXP arg2 = CDDR(call) ;
- // we know how to handle fun( ., na.rm = TRUE/FALSE )
- if( TAG(arg2) == R_NaRmSymbol ){
- SEXP narm = CAR(arg2) ;
- if( TYPEOF(narm) == LGLSXP && LENGTH(narm) == 1 ){
- if( LOGICAL(narm)[0] == TRUE ){
- return minmax_prototype_impl<Tmpl,true>(arg, is_summary) ;
- } else {
- return minmax_prototype_impl<Tmpl,false>(arg, is_summary) ;
- }
- }
- }
- }
- return 0 ;
-}
-
-Result* count_prototype(SEXP args, const LazySubsets&, int){
- if( Rf_length(args) != 1)
- stop("n does not take arguments") ;
- return new Count ;
-}
-
-Result* count_distinct_prototype(SEXP call, const LazySubsets& subsets, int nargs){
- MultipleVectorVisitors visitors ;
- bool na_rm = false ;
-
- for( SEXP p = CDR(call) ; !Rf_isNull(p) ; p = CDR(p) ){
- SEXP x = CAR(p) ;
- if( !Rf_isNull(TAG(p)) && TAG(p) == Rf_install("na.rm") ){
- if( TYPEOF(x) == LGLSXP && Rf_length(x) == 1){
- na_rm = LOGICAL(x)[0] ;
- } else {
- stop("incompatible value for `na.rm` parameter") ;
- }
- } else if( TYPEOF(x) == SYMSXP ) {
- visitors.push_back( subsets.get_variable( x ) ) ;
- } else {
- return 0 ;
- }
- }
-
- if( visitors.size() == 0 ) {
- stop("need at least one column for n_distinct()");
- }
-
- if( na_rm ){
- return new Count_Distinct_Narm<MultipleVectorVisitors>(visitors) ;
- } else {
- return new Count_Distinct<MultipleVectorVisitors>(visitors) ;
- }
-}
-
-Result* row_number_prototype(SEXP call, const LazySubsets& subsets, int nargs ){
- if( nargs > 1 || subsets.size() == 0 ) return 0;
-
- if( nargs == 0 ) return new RowNumber_0() ;
-
- RObject data( CADR(call) );
- if( TYPEOF(data) == LANGSXP && CAR(data) == Rf_install("desc") ){
- data = CADR(data) ;
-
- if( TYPEOF(data) == SYMSXP ){
- if( subsets.count(data) ) data = subsets.get_variable(data) ;
- else return 0 ;
- }
- if (Rf_length(data) == subsets.nrows() ){
- switch( TYPEOF(data) ){
- case INTSXP: return new RowNumber<INTSXP, false>( data ) ;
- case REALSXP: return new RowNumber<REALSXP, false>( data ) ;
- case STRSXP: return new RowNumber<STRSXP, false>( data ) ;
- default: break;
- }
- }
- return 0 ;
- }
- if( TYPEOF(data) == SYMSXP ){
- if( subsets.count(data) ) data = subsets.get_variable(data) ;
- else return 0 ;
- }
- if (Rf_length(data) == subsets.nrows() ){
- switch( TYPEOF(data) ){
- case INTSXP: return new RowNumber<INTSXP,true>( data ) ;
- case REALSXP: return new RowNumber<REALSXP,true>( data ) ;
- case STRSXP: return new RowNumber<STRSXP,true>( data ) ;
- default: break;
- }
- }
- // we don't know how to handle it.
- return 0 ;
-}
-
-Result* ntile_prototype( SEXP call, const LazySubsets& subsets, int nargs ){
- if( nargs != 2 ) return 0;
-
- // handle 2nd arg
- SEXP ntiles = CADDR(call) ;
- double number_tiles ;
- try{
- number_tiles = as<int>(ntiles) ;
- } catch( ... ){
- stop("could not convert n to scalar integer") ;
- }
-
- RObject data( CADR(call) );
- if( TYPEOF(data) == LANGSXP && CAR(data) == Rf_install("desc") ){
- data = CADR(data) ;
-
- if( TYPEOF(data) == SYMSXP ){
- if( subsets.count(data) ) data = subsets.get_variable(data) ;
- else return 0 ;
- }
- switch( TYPEOF(data) ){
- case INTSXP: return new Ntile<INTSXP, false>( data, number_tiles ) ;
- case REALSXP: return new Ntile<REALSXP, false>( data, number_tiles ) ;
- case STRSXP: return new Ntile<STRSXP, false>( data, number_tiles ) ;
- default: break;
- }
- }
- if( TYPEOF(data) == SYMSXP ){
- if( subsets.count(data) ) data = subsets.get_variable(data) ;
- else return 0 ;
- }
- if( subsets.nrows() != Rf_length(data) ) return 0 ;
-
- switch( TYPEOF(data) ){
- case INTSXP: return new Ntile<INTSXP ,true>( data, number_tiles ) ;
- case REALSXP: return new Ntile<REALSXP,true>( data, number_tiles ) ;
- case STRSXP: return new Ntile<STRSXP ,true>( data, number_tiles ) ;
- default: break;
- }
- // we don't know how to handle it.
- return 0 ;
-}
-
-template <typename Increment>
-Result* rank_impl_prototype(SEXP call, const LazySubsets& subsets, int nargs ){
- if( nargs != 1) return 0;
- RObject data( CADR(call) );
-
- if( TYPEOF(data) == LANGSXP && CAR(data) == Rf_install("desc") ){
- data = CADR(data) ;
- if( TYPEOF(data) == SYMSXP ){
- if( subsets.count(data) ) data = subsets.get_variable(data) ;
- else return 0 ;
- }
-
- switch( TYPEOF(data) ){
- case INTSXP: return new Rank_Impl<INTSXP, Increment, false>( data ) ;
- case REALSXP: return new Rank_Impl<REALSXP, Increment, false>( data ) ;
- case STRSXP: return new Rank_Impl<STRSXP, Increment, false>( data ) ;
- default: break;
- }
- }
-
- if( TYPEOF(data) == SYMSXP ){
- if( subsets.count(data) ) data = subsets.get_variable(data) ;
- else return 0 ;
- }
- switch( TYPEOF(data) ){
- case INTSXP: return new Rank_Impl<INTSXP, Increment, true>( data ) ;
- case REALSXP: return new Rank_Impl<REALSXP, Increment, true>( data ) ;
- case STRSXP: return new Rank_Impl<STRSXP, Increment, true>( data ) ;
- default: break;
- }
- // we don't know how to handle it.
- return 0 ;
-}
-
-struct LeadLag{
-
- LeadLag( SEXP call ) : data(R_NilValue), n(1), def(R_NilValue), ok(true){
-
- SEXP p = CDR(call) ;
- SEXP tag = TAG(p) ;
- if( tag != R_NilValue && tag != Rf_install("x") ) {
- ok = false ;
- return ;
- }
- data = CAR(p) ;
-
- p = CDR(p);
- while( p != R_NilValue ){
- tag = TAG(p) ;
- if( tag != R_NilValue && tag != Rf_install("n") && tag != Rf_install("default") ) {
- ok = false ;
- return ;
- }
- if( tag == Rf_install("n") || tag == R_NilValue ){
- try{
- n = as<int>( CAR(p) );
- } catch( ... ){
- SEXP n_ = CADDR(call);
- std::stringstream s ;
- stop( "could not convert second argument to an integer. type=%s, length = %d",
- type2name(n_), Rf_length(n_) ) ;
- }
- }
- if( tag == Rf_install("default") ){
- def = CAR(p) ;
- if( TYPEOF(def) == LANGSXP ) ok = false ;
- }
- p = CDR(p) ;
- }
- }
-
- RObject data ;
- int n ;
- RObject def ;
-
- bool ok ;
-
-} ;
-
-template < template<int> class Templ>
-Result* leadlag_prototype(SEXP call, const LazySubsets& subsets, int nargs){
- LeadLag args(call) ;
- if( !args.ok ) return 0 ;
- RObject& data = args.data ;
-
- if( TYPEOF(data) == SYMSXP && subsets.count(data) ){
- bool is_summary = subsets.is_summary(data) ;
- int n = args.n ;
- data = subsets.get_variable(data) ;
-
- switch( TYPEOF(data) ){
- case INTSXP: return new Templ<INTSXP> (data, n, args.def, is_summary) ;
- case REALSXP: return new Templ<REALSXP>(data, n, args.def, is_summary) ;
- case STRSXP: return new Templ<STRSXP> (data, n, args.def, is_summary) ;
- case LGLSXP: return new Templ<LGLSXP> (data, n, args.def, is_summary) ;
- default: break ;
- }
-
- }
- return 0 ;
-}
-
-template < template <int> class Templ>
-Result* cumfun_prototype(SEXP call, const LazySubsets& subsets, int nargs){
- if( nargs != 1 ) return 0 ;
- RObject data( CADR(call) );
- if(TYPEOF(data) == SYMSXP) {
- data = subsets.get_variable(data) ;
- }
- switch( TYPEOF(data) ){
- case INTSXP: return new Templ<INTSXP>(data) ;
- case REALSXP: return new Templ<REALSXP>(data) ;
- default: break ;
- }
- return 0 ;
-}
-
-bool argmatch( const std::string& target, const std::string& s){
- if( s.size() > target.size() ) return false ;
- return target.compare( 0, s.size(), s ) == 0 ;
-}
-
-Result* in_prototype( SEXP call, const LazySubsets& subsets, int nargs){
- SEXP lhs = CADR(call) ;
- SEXP rhs = CADDR(call) ;
-
- // if lhs is not a symbol, let R handle it
- if( TYPEOF(lhs) != SYMSXP ) return 0 ;
-
- // if the lhs is not in the data, let R handle it
- if( !subsets.count(lhs) ) return 0 ;
-
- SEXP v = subsets.get_variable(lhs) ;
-
- // if the type of the data is not the same as the type of rhs,
- // including if it needs evaluation, let R handle it
- if( TYPEOF(v) != TYPEOF(rhs) ) return 0 ;
-
- // otherwise use hybrid version
- switch( TYPEOF(v) ){
- case STRSXP: return new In<STRSXP>(v, rhs) ;
- default: break ;
- }
-
- // type not handled
- return 0 ;
-
-}
-
-HybridHandlerMap& get_handlers(){
- static HybridHandlerMap handlers ;
- if( !handlers.size() ){
- handlers[ Rf_install( "n") ] = count_prototype ;
- handlers[ Rf_install( "n_distinct" ) ] = count_distinct_prototype ;
- handlers[ Rf_install( "row_number" ) ] = row_number_prototype ;
- handlers[ Rf_install( "ntile" ) ] = ntile_prototype ;
-
- handlers[ Rf_install( "min" ) ] = minmax_prototype<dplyr::Min> ;
- handlers[ Rf_install( "max" ) ] = minmax_prototype<dplyr::Max> ;
-
- handlers[ Rf_install( "mean" ) ] = simple_prototype<dplyr::Mean> ;
- handlers[ Rf_install( "var" ) ] = simple_prototype<dplyr::Var> ;
- handlers[ Rf_install( "sd") ] = simple_prototype<dplyr::Sd> ;
- handlers[ Rf_install( "sum" ) ] = simple_prototype<dplyr::Sum>;
-
- handlers[ Rf_install( "min_rank" ) ] = rank_impl_prototype<dplyr::internal::min_rank_increment> ;
- handlers[ Rf_install( "percent_rank" ) ] = rank_impl_prototype<dplyr::internal::percent_rank_increment> ;
- handlers[ Rf_install( "dense_rank" ) ] = rank_impl_prototype<dplyr::internal::dense_rank_increment> ;
- handlers[ Rf_install( "cume_dist" ) ] = rank_impl_prototype<dplyr::internal::cume_dist_increment> ;
-
- /*
- handlers[ Rf_install( "cumsum") ] = cumfun_prototype<CumSum> ;
- handlers[ Rf_install( "cummin") ] = cumfun_prototype<CumMin> ;
- handlers[ Rf_install( "cummax") ] = cumfun_prototype<CumMax> ;
- */
-
- handlers[ Rf_install( "lead" ) ] = leadlag_prototype<Lead> ;
- handlers[ Rf_install( "lag" ) ] = leadlag_prototype<Lag> ;
-
- handlers[ Rf_install( "first" ) ] = first_prototype ;
- handlers[ Rf_install( "last" ) ] = last_prototype ;
- handlers[ Rf_install( "nth" ) ] = nth_prototype ;
-
- // handlers[ Rf_install( "%in%" ) ] = in_prototype ;
-
- }
- return handlers ;
-}
-
-Result* constant_handler(SEXP constant){
- switch(TYPEOF(constant)){
- case INTSXP:
- {
- if( Rf_inherits(constant, "Date") ) return new TypedConstantResult<INTSXP>(constant, get_date_classes() ) ;
- return new ConstantResult<INTSXP>(constant) ;
- }
- case REALSXP:
- {
- if( Rf_inherits(constant, "difftime") ) return new DifftimeConstantResult<REALSXP>(constant) ;
- if( Rf_inherits(constant, "POSIXct") ) return new TypedConstantResult<REALSXP>(constant, get_time_classes() ) ;
- if( Rf_inherits(constant, "Date") ) return new TypedConstantResult<REALSXP>(constant, get_date_classes() ) ;
- return new ConstantResult<REALSXP>(constant) ;
- }
- case STRSXP: return new ConstantResult<STRSXP>(constant) ;
- case LGLSXP: return new ConstantResult<LGLSXP>(constant) ;
- }
- return 0;
-}
-
-Result* get_handler( SEXP call, const LazySubsets& subsets, const Environment& env ){
- if( TYPEOF(call) == LANGSXP ){
- int depth = Rf_length(call) ;
- HybridHandlerMap& handlers = get_handlers() ;
- SEXP fun_symbol = CAR(call) ;
- if( TYPEOF(fun_symbol) != SYMSXP ) return 0 ;
-
- HybridHandlerMap::const_iterator it = handlers.find( fun_symbol ) ;
- if( it == handlers.end() ) return 0 ;
-
- return it->second( call, subsets, depth - 1 );
- } else if( TYPEOF(call) == SYMSXP ){
- if( !subsets.count(call) ){
- SEXP data = env.find( CHAR(PRINTNAME(call)) ) ;
- if( Rf_length(data) == 1 ) return constant_handler(data) ;
- }
- } else {
- // TODO: perhaps deal with SYMSXP separately
- if( Rf_length(call) == 1 ) return constant_handler(call) ;
- }
- return 0 ;
-}
-
-void registerHybridHandler( const char* name, HybridHandler proto){
- get_handlers()[ Rf_install(name) ] = proto ;
-}
-
-bool can_simplify( SEXP call ){
- if( TYPEOF(call) == LISTSXP ){
- bool res = can_simplify( CAR(call) ) ;
- if( res ) return true ;
- return can_simplify( CDR(call) ) ;
- }
-
- if( TYPEOF(call) == LANGSXP ){
- SEXP fun_symbol = CAR(call) ;
- if( TYPEOF(fun_symbol) != SYMSXP ) return false ;
-
- if( get_handlers().count( fun_symbol ) ) return true ;
-
- return can_simplify( CDR(call) ) ;
- }
- return false ;
-}
-
-template <typename Index>
-DataFrame subset( DataFrame df, const Index& indices, CharacterVector columns, CharacterVector classes){
- return DataFrameSubsetVisitors(df, columns).subset(indices, classes) ;
-}
-
-template <typename Index>
-DataFrame subset( DataFrame df, const Index& indices, CharacterVector classes){
- return DataFrameSubsetVisitors(df).subset(indices, classes) ;
-}
-
-template <typename Index>
-DataFrame subset_join( DataFrame x, DataFrame y,
- const Index& indices_x, const Index& indices_y,
- CharacterVector by_x, CharacterVector by_y ,
- const std::string& suffix_x, const std::string& suffix_y,
- CharacterVector classes) {
- // first the joined columns
- DataFrameJoinVisitors join_visitors(x, y, by_x, by_y, false) ;
- int n_join_visitors = join_visitors.size() ;
-
- // then columns from x but not y
- CharacterVector all_x_columns = x.names() ;
- std::vector<bool> joiner( all_x_columns.size() ) ;
- CharacterVector x_columns( all_x_columns.size() - n_join_visitors ) ;
- IntegerVector xm = r_match( all_x_columns, by_x) ;
- for( int i=0, k=0; i<all_x_columns.size(); i++){
- if( xm[i] == NA_INTEGER ){
- joiner[i] = false ;
- x_columns[k++] = all_x_columns[i] ;
- } else {
- joiner[i] = true ;
- }
- }
- DataFrameSubsetVisitors visitors_x(x, x_columns) ;
- int nv_x = visitors_x.size() ;
-
- // then columns from y but not x
- CharacterVector all_y_columns = y.names() ;
- CharacterVector y_columns( all_y_columns.size() - n_join_visitors ) ;
- IntegerVector ym = r_match(all_y_columns, by_y) ;
- for( int i=0, k=0; i<all_y_columns.size(); i++){
- if( ym[i] == NA_INTEGER ){
- y_columns[k++] = all_y_columns[i] ;
- }
- }
- DataFrameSubsetVisitors visitors_y(y, y_columns) ;
-
- int nv_y = visitors_y.size() ;
-
- // construct out object
- int nrows = indices_x.size() ;
- List out(n_join_visitors+nv_x+nv_y);
- CharacterVector names(n_join_visitors+nv_x+nv_y) ;
-
- int index_join_visitor = 0 ;
- int index_x_visitor = 0 ;
- // ---- join visitors
- for( int i=0; i<all_x_columns.size(); i++){
- String col_name = all_x_columns[i] ;
- if( joiner[i] ){
- JoinVisitor* v = join_visitors.get(xm[i]-1) ;
- out[i] = v->subset(indices_x) ;
- index_join_visitor++ ;
- } else {
-
- while(
- ( std::find(y_columns.begin(), y_columns.end(), col_name.get_sexp()) != y_columns.end() ) ||
- ( std::find(names.begin(), names.begin() + i, col_name.get_sexp()) != names.begin() + i)
- ){
- col_name += suffix_x ;
- }
- out[i] = visitors_x.get(index_x_visitor)->subset(indices_x) ;
- index_x_visitor++ ;
- }
- names[i] = col_name ;
- }
-
- int k = index_join_visitor + index_x_visitor ;
- for( int i=0; i<nv_y; i++, k++){
- String col_name = y_columns[i] ;
-
- // we suffix by .y if this column is in x_columns
-
- while(
- ( std::find(all_x_columns.begin(), all_x_columns.end(), col_name.get_sexp()) != all_x_columns.end() ) ||
- ( std::find(names.begin(), names.begin() + k, col_name.get_sexp()) != names.begin() + k )
- ){
- col_name += suffix_y ;
- }
-
- out[k] = visitors_y.get(i)->subset(indices_y) ;
- names[k] = col_name ;
- }
- out.attr("class") = classes ;
- set_rownames(out, nrows) ;
- out.names() = names ;
-
- SEXP vars = x.attr( "vars" ) ;
- if( !Rf_isNull(vars) )
- out.attr( "vars" ) = vars ;
-
- return (SEXP)out ;
-}
-
-template <typename TargetContainer, typename SourceContainer>
-void push_back( TargetContainer& x, const SourceContainer& y ){
- x.insert( x.end(), y.begin(), y.end() ) ;
-}
-template <typename TargetContainer, typename SourceContainer>
-void push_back_right( TargetContainer& x, const SourceContainer& y ){
- // x.insert( x.end(), y.begin(), y.end() ) ;
- int n = y.size() ;
- for( int i=0; i<n; i++){
- x.push_back( -y[i]-1 ) ;
- }
-}
-
-template <typename Container>
-void push_back( Container& x, typename Container::value_type value, int n ){
- for( int i=0; i<n; i++)
- x.push_back( value ) ;
-}
-
-// [[Rcpp::export]]
-void assert_all_white_list(const DataFrame& data){
- // checking variables are on the white list
- int nc = data.size() ;
- for( int i=0; i<nc; i++){
- if( !white_list(data[i]) ){
- CharacterVector names = data.names() ;
- String name_i = names[i] ;
- SEXP v = data[i] ;
-
- SEXP klass = Rf_getAttrib(v, R_ClassSymbol) ;
- if( !Rf_isNull(klass) ){
- stop( "column '%s' has unsupported class : %s",
- name_i.get_cstring() , get_single_class(v) );
- }
- else {
- stop( "column '%s' has unsupported type : %s",
- name_i.get_cstring() , Rf_type2char(TYPEOF(v)) );
- }
-
- }
- }
-}
-
-// [[Rcpp::export]]
-DataFrame semi_join_impl( DataFrame x, DataFrame y, CharacterVector by_x, CharacterVector by_y ){
- if( by_x.size() == 0) stop("no variable to join by") ;
- typedef VisitorSetIndexMap<DataFrameJoinVisitors, std::vector<int> > Map ;
- DataFrameJoinVisitors visitors(x, y, by_x, by_y, false) ;
- Map map(visitors);
-
- // train the map in terms of x
- train_push_back( map, x.nrows() ) ;
-
- int n_y = y.nrows() ;
- // this will collect indices from rows in x that match rows in y
- std::vector<int> indices ;
- for( int i=0; i<n_y; i++){
- // find a row in x that matches row i from y
- Map::iterator it = map.find(-i-1) ;
-
- if( it != map.end() ){
- // collect the indices and remove them from the
- // map so that they are only found once.
- push_back( indices, it->second ) ;
-
- map.erase(it) ;
-
- }
- }
-
- return subset(x, indices, x.names(), x.attr("class") ) ;
-}
-
-// [[Rcpp::export]]
-DataFrame anti_join_impl( DataFrame x, DataFrame y, CharacterVector by_x, CharacterVector by_y){
- if( by_x.size() == 0) stop("no variable to join by") ;
- typedef VisitorSetIndexMap<DataFrameJoinVisitors, std::vector<int> > Map ;
- DataFrameJoinVisitors visitors(x, y, by_x, by_y, false) ;
- Map map(visitors);
-
- // train the map in terms of x
- train_push_back( map, x.nrows() ) ;
-
- int n_y = y.nrows() ;
- // remove the rows in x that match
- for( int i=0; i<n_y; i++){
- Map::iterator it = map.find(-i-1) ;
- if( it != map.end() )
- map.erase(it) ;
- }
-
- // collect what's left
- std::vector<int> indices ;
- for( Map::iterator it = map.begin() ; it != map.end(); ++it)
- push_back( indices, it->second ) ;
-
- return subset(x, indices, x.names(), x.attr( "class" ) ) ;
-}
-
-// [[Rcpp::export]]
-DataFrame inner_join_impl(DataFrame x, DataFrame y,
- CharacterVector by_x, CharacterVector by_y,
- std::string& suffix_x, std::string& suffix_y){
- if( by_x.size() == 0) stop("no variable to join by") ;
- typedef VisitorSetIndexMap<DataFrameJoinVisitors, std::vector<int> > Map ;
- DataFrameJoinVisitors visitors(x, y, by_x, by_y, true) ;
- Map map(visitors);
-
- int n_x = x.nrows(), n_y = y.nrows() ;
-
- std::vector<int> indices_x ;
- std::vector<int> indices_y ;
-
- train_push_back_right( map, n_y ) ;
-
- for( int i=0; i<n_x; i++){
- Map::iterator it = map.find(i) ;
- if( it != map.end() ){
- push_back_right( indices_y, it->second );
- push_back( indices_x, i, it->second.size() ) ;
- }
- }
-
- return subset_join(
- x, y,
- indices_x, indices_y,
- by_x, by_y,
- suffix_x, suffix_y,
- x.attr( "class")
- );
-}
-
-// [[Rcpp::export]]
-DataFrame left_join_impl(DataFrame x, DataFrame y,
- CharacterVector by_x, CharacterVector by_y,
- std::string& suffix_x, std::string& suffix_y){
- if( by_x.size() == 0) stop("no variable to join by") ;
- typedef VisitorSetIndexMap<DataFrameJoinVisitors, std::vector<int> > Map ;
- DataFrameJoinVisitors visitors(y, x, by_y, by_x, true) ;
-
- Map map(visitors);
-
- // train the map in terms of y
- train_push_back( map, y.nrows() ) ;
-
- std::vector<int> indices_x ;
- std::vector<int> indices_y ;
-
- int n_x = x.nrows() ;
- for( int i=0; i<n_x; i++){
- // find a row in y that matches row i in x
- Map::iterator it = map.find(-i-1) ;
- if( it != map.end() ){
- push_back( indices_y, it->second ) ;
- push_back( indices_x, i, it->second.size() ) ;
- } else {
- indices_y.push_back(-1) ; // mark NA
- indices_x.push_back(i) ;
- }
- }
-
- return subset_join(
- x, y,
- indices_x, indices_y,
- by_x, by_y,
- suffix_x, suffix_y,
- x.attr( "class" )
- );
-}
-
-// [[Rcpp::export]]
-DataFrame right_join_impl(DataFrame x, DataFrame y,
- CharacterVector by_x, CharacterVector by_y,
- std::string& suffix_x, std::string& suffix_y){
- if( by_x.size() == 0) stop("no variable to join by") ;
- typedef VisitorSetIndexMap<DataFrameJoinVisitors, std::vector<int> > Map ;
- DataFrameJoinVisitors visitors(x, y, by_x, by_y, true) ;
- Map map(visitors);
-
- // train the map in terms of x
- train_push_back( map, x.nrows() ) ;
-
- std::vector<int> indices_x ;
- std::vector<int> indices_y ;
-
- int n_y = y.nrows() ;
- for( int i=0; i<n_y; i++){
- // find a row in y that matches row i in x
- Map::iterator it = map.find(-i-1) ;
- if( it != map.end() ){
- push_back( indices_x, it->second ) ;
- push_back( indices_y, i, it->second.size() ) ;
- } else {
- indices_x.push_back(-i-1) ; // point to the i-th row in the right table
- indices_y.push_back(i) ;
- }
- }
- return subset_join(
- x, y,
- indices_x, indices_y,
- by_x, by_y,
- suffix_x, suffix_y,
- x.attr( "class" )
- );
-}
-
-// [[Rcpp::export]]
-DataFrame full_join_impl(DataFrame x, DataFrame y,
- CharacterVector by_x, CharacterVector by_y,
- std::string& suffix_x, std::string& suffix_y){
- if( by_x.size() == 0) stop("no variable to join by") ;
- typedef VisitorSetIndexMap<DataFrameJoinVisitors, std::vector<int> > Map ;
- DataFrameJoinVisitors visitors(y, x, by_y, by_x, true) ;
- Map map(visitors);
-
- // train the map in terms of y
- train_push_back( map, y.nrows() ) ;
-
- std::vector<int> indices_x ;
- std::vector<int> indices_y ;
-
- int n_x = x.nrows(), n_y = y.nrows() ;
-
- // get both the matches and the rows from left but not right
- for( int i=0; i<n_x; i++){
- // find a row in y that matches row i in x
- Map::iterator it = map.find(-i-1) ;
- if( it != map.end() ){
- push_back( indices_y, it->second ) ;
- push_back( indices_x, i, it->second.size() ) ;
- } else {
- indices_y.push_back(-1) ; // mark NA
- indices_x.push_back(i) ;
- }
- }
-
- // train a new map in terms of x this time
- DataFrameJoinVisitors visitors2(x,y,by_x,by_y, false) ;
- Map map2(visitors2);
- train_push_back( map2, x.nrows() ) ;
-
- for( int i=0; i<n_y; i++){
- // try to find row in x that matches this row of y
- Map::iterator it = map2.find(-i-1) ;
- if( it == map2.end() ){
- indices_x.push_back(-i-1) ;
- indices_y.push_back(i) ;
- }
- }
-
- return subset_join(x, y,
- indices_x, indices_y,
- by_x, by_y,
- suffix_x, suffix_y,
- x.attr( "class" )
- );
-}
-
-// [[Rcpp::export]]
-SEXP shallow_copy(const List& data){
- int n = data.size() ;
- List out(n) ;
- for( int i=0; i<n; i++) {
- out[i] = shared_SEXP(data[i]) ;
- }
- copy_attributes(out, data) ;
- return out ;
-}
-
-// [[Rcpp::export]]
-dplyr::BoolResult compatible_data_frame_nonames( DataFrame x, DataFrame y, bool convert){
- int n = x.size() ;
- if( n != y.size() )
- return no_because( tfm::format("different number of columns : %d x %d", n, y.size() ) ) ;
-
- if( convert ){
- for( int i=0; i<n; i++){
- try{
- boost::scoped_ptr<JoinVisitor> v( join_visitor( x[i], y[i], "x", "x", true ) ) ;
- } catch(...){
- return no_because( "incompatible" ) ;
- }
- }
- } else {
- for( int i=0; i<n; i++){
- SEXP xi = x[i], yi=y[i] ;
- if( TYPEOF(xi) != TYPEOF(yi))
- return no_because( "incompatible types" ) ;
-
- if( TYPEOF(xi) == INTSXP){
- if( Rf_inherits(xi, "factor") && Rf_inherits(yi, "factor") ){
- if( same_levels(xi, yi) ) continue ;
- return no_because( "factors with different levels" ) ;
- }
-
- if( Rf_inherits(xi, "factor") ) return no_because( "cannot compare factor and integer" ) ;
- if( Rf_inherits(yi, "factor") ) return no_because( "cannot compare factor and integer" ) ;
-
- }
- }
- }
-
- return yes() ;
-
-}
-
-// [[Rcpp::export]]
-dplyr::BoolResult compatible_data_frame( DataFrame x, DataFrame y, bool ignore_col_order = true, bool convert = false ){
- int n = x.size() ;
-
- bool null_x = Rf_isNull(x.names()), null_y = Rf_isNull(y.names()) ;
- if( null_x && !null_y ){
- return no_because( "x does not have names, but y does") ;
- } else if( null_y && !null_x){
- return no_because( "y does not have names, but x does") ;
- } else if( null_x && null_y){
- return compatible_data_frame_nonames(x,y, convert) ;
- }
-
- CharacterVector names_x = x.names() ;
- CharacterVector names_y = y.names() ;
-
- CharacterVector names_y_not_in_x = setdiff( names_y, names_x );
- CharacterVector names_x_not_in_y = setdiff( names_x, names_y );
-
- if( !ignore_col_order ){
- if( names_y_not_in_x.size() == 0 && names_y_not_in_x.size() == 0 ){
- // so the names are the same, check if they are in the same order
- for( int i=0; i<n; i++){
- if( names_x[i] != names_y[i] ){
- return no_because("Same column names, but different order") ;
- }
- }
- }
- }
-
- std::stringstream ss ;
- bool ok = true ;
- if( names_y_not_in_x.size() ){
- ok = false ;
- ss << "Cols in y but not x: " << collapse(names_y_not_in_x) << ". ";
- }
-
- if( names_x_not_in_y.size() ){
- ok = false ;
- ss << "Cols in x but not y: " << collapse(names_x_not_in_y) << ". ";
- }
-
- if(!ok){
- return no_because( ss.str() ) ;
- }
-
- IntegerVector orders = r_match( names_x, names_y ) ;
-
- String name ;
- for( int i=0; i<n; i++){
- name = names_x[i] ;
- SEXP xi = x[i], yi = y[orders[i]-1] ;
- boost::scoped_ptr<SubsetVectorVisitor> vx( subset_visitor( xi ) ) ;
- boost::scoped_ptr<SubsetVectorVisitor> vy( subset_visitor( yi ) ) ;
- SubsetVectorVisitor* px = vx.get() ;
- SubsetVectorVisitor* py = vy.get() ;
-
- if( typeid(*px) != typeid(*py) ) {
- ss << "Incompatible type for column "
- << name.get_cstring()
- << ": x " << vx->get_r_type()
- << ", y " << vy->get_r_type() ;
-
- if( !convert ) {
- ok = false ;
- continue ;
- }
- }
-
- if( ! vx->is_compatible( py, ss, name ) ) {
- ok = false ;
- }
- }
-
- if(!ok) return no_because( ss.str() ) ;
- return yes() ;
-}
-
-class RowTrack {
-public:
- RowTrack( const std::string& msg, int max_count_ = 10 ) : ss(), count(0), max_count(max_count_) {
- ss << msg ;
- }
-
- void record( int i){
- if( count > max_count ) return ;
- if( count ) ss << ", " ;
- int idx = i >= 0 ? (i+1) : -i ;
- ss << idx ;
- if( count == max_count ) ss << "[...]" ;
- count++ ;
- }
-
- bool empty() const {
- return count == 0 ;
- }
-
- std::string str() const {
- return ss.str() ;
- }
-
-private:
- std::stringstream ss ;
- int count ;
- int max_count ;
-} ;
-
-// [[Rcpp::export]]
-dplyr::BoolResult equal_data_frame(DataFrame x, DataFrame y, bool ignore_col_order = true, bool ignore_row_order = true, bool convert = false ){
- BoolResult compat = compatible_data_frame(x, y, ignore_col_order, convert);
- if( !compat ) return compat ;
-
- typedef VisitorSetIndexMap<DataFrameJoinVisitors, std::vector<int> > Map ;
- DataFrameJoinVisitors visitors(x, y, x.names(), x.names(), true ) ;
- Map map(visitors);
-
- // train the map in both x and y
- int nrows_x = x.nrows() ;
- int nrows_y = y.nrows() ;
-
- if( nrows_x != nrows_y )
- return no_because( "Different number of rows" ) ;
- if( x.size() == 0 )
- return yes() ;
-
- for( int i=0; i<nrows_x; i++) map[i].push_back(i) ;
- for( int i=0; i<nrows_y; i++) map[-i-1].push_back(-i-1) ;
-
- RowTrack track_x( "Rows in x but not y: " ) ;
- RowTrack track_y( "Rows in y but not x: " ) ;
- RowTrack track_mismatch( "Rows with difference occurences in x and y: " ) ;
-
- bool ok = true ;
- Map::const_iterator it = map.begin() ;
-
- for( ; it != map.end(); ++it){
- // retrieve the indices ( -ves for y, +ves for x )
- const std::vector<int>& chunk = it->second ;
- int n = chunk.size() ;
-
- int count_left = 0, count_right = 0 ;
- for( int i=0; i<n; i++){
- if( chunk[i] < 0 )
- count_right++ ;
- else
- count_left++ ;
- }
- if( count_right == 0 ){
- track_x.record( chunk[0] ) ;
- ok = false ;
- } else if( count_left == 0){
- track_y.record( chunk[0] ) ;
- ok = false ;
- } else if( count_left != count_right ){
- track_mismatch.record( chunk[0] ) ;
- ok = false ;
- }
-
- }
-
- if(!ok){
- std::stringstream ss ;
- if( ! track_x.empty() ) ss << track_x.str() << ". " ;
- if( ! track_y.empty() ) ss << track_y.str() << ". " ;
- if( ! track_mismatch.empty() ) ss << track_mismatch.str() ;
-
- return no_because( ss.str() ) ;
- }
-
- if(ok && ignore_row_order) return yes();
-
- if( !ignore_row_order ){
- for( int i=0; i<nrows_x; i++){
- if( !visitors.equal( i, -i-1) ){
- return no_because( "Same row values, but different order" ) ;
- }
- }
- }
-
- return yes() ;
-}
-
-// [[Rcpp::export]]
-dplyr::BoolResult all_equal_data_frame( List args, Environment env ){
- int n = args.size() ;
- DataFrame x0 = Rf_eval( args[0], env) ;
- for( int i=1; i<n; i++){
- BoolResult test = equal_data_frame( x0, Rf_eval( args[i], env ) ) ;
- if( !test ) return test ;
- }
- return yes() ;
-}
-
-// [[Rcpp::export]]
-DataFrame union_data_frame( DataFrame x, DataFrame y){
- BoolResult compat = compatible_data_frame(x,y,true,true) ;
- if( !compat ){
- stop( "not compatible: %s", compat.why_not() );
- }
-
- typedef VisitorSetIndexSet<DataFrameJoinVisitors> Set ;
- DataFrameJoinVisitors visitors(x, y, x.names(), x.names(), true) ;
- Set set(visitors);
-
- train_insert( set, x.nrows() ) ;
- train_insert_right( set, y.nrows() ) ;
-
- return visitors.subset( set, x.attr("class") ) ;
-}
-
-// [[Rcpp::export]]
-DataFrame intersect_data_frame( DataFrame x, DataFrame y){
- BoolResult compat = compatible_data_frame(x,y,true,true) ;
- if( !compat ){
- stop( "not compatible: %s", compat.why_not() );
- }
- typedef VisitorSetIndexSet<DataFrameJoinVisitors> Set ;
-
- DataFrameJoinVisitors visitors(x, y, x.names(), x.names(), true ) ;
- Set set(visitors);
-
- train_insert( set, x.nrows() ) ;
-
- std::vector<int> indices ;
- int n_y = y.nrows() ;
- for( int i=0; i<n_y; i++) {
- Set::iterator it = set.find( -i-1 ) ;
- if( it != set.end() ){
- indices.push_back(*it) ;
- set.erase(it) ;
- }
- }
-
- return visitors.subset( indices, x.attr("class") ) ;
-}
-
-// [[Rcpp::export]]
-DataFrame setdiff_data_frame( DataFrame x, DataFrame y){
- BoolResult compat = compatible_data_frame(x,y,true,true) ;
- if( !compat ){
- stop( "not compatible: %s", compat.why_not() );
- }
-
- typedef VisitorSetIndexSet<DataFrameJoinVisitors> Set ;
- DataFrameJoinVisitors visitors(y, x, y.names(), y.names(), true ) ;
- Set set(visitors);
-
- train_insert( set, y.nrows() ) ;
-
- std::vector<int> indices ;
-
- int n_x = x.nrows() ;
- for( int i=0; i<n_x; i++) {
- if( !set.count(-i-1) ){
- set.insert(-i-1) ;
- indices.push_back(-i-1) ;
- }
- }
-
- return visitors.subset( indices, x.attr("class") ) ;
-}
-
-// [[Rcpp::export]]
-IntegerVector match_data_frame( DataFrame x, DataFrame y){
- if( !compatible_data_frame(x,y,true,true) )
- stop( "not compatible" );
-
- typedef VisitorSetIndexSet<DataFrameJoinVisitors> Set ;
- DataFrameJoinVisitors visitors(y, x, x.names(), x.names(), true ) ;
- Set set(visitors);
-
- train_insert( set, y.nrows() ) ;
-
- int n_x = x.nrows() ;
- IntegerVector res = no_init( n_x );
- for( int i=0; i<n_x; i++) {
- Set::iterator it = set.find( -i-1 );
- res[i] = ( it == set.end() ) ? NA_INTEGER : (*it+1) ;
- }
-
- return res ;
-}
-
-// [[Rcpp::export]]
-SEXP resolve_vars( List new_groups, CharacterVector names){
- int n = new_groups.size() ;
- for( int i=0; i<n; i++){
- List lazy = new_groups[i] ;
- Environment env = lazy[1] ;
- SEXP s = lazy[0] ;
-
- // expand column
- if( TYPEOF(s) == SYMSXP ){
-
- } else if( TYPEOF(s) == LANGSXP && CAR(s) == Rf_install("column") && Rf_length(s) == 2 ){
- s = extract_column( CADR(s), env ) ;
- } else {
- continue ;
- }
- // check that s is indeed in the data
-
- int pos = as<int>(r_match( CharacterVector::create(PRINTNAME(s)), names ));
- if( pos == NA_INTEGER){
- stop("unknown variable to group by : %s", CHAR(PRINTNAME(s))) ;
- }
- lazy[0] = s ;
- }
-
- return new_groups ;
-}
-
-// [[Rcpp::export]]
-DataFrame grouped_df_impl( DataFrame data, ListOf<Symbol> symbols, bool drop ){
- assert_all_white_list(data);
- DataFrame copy( shallow_copy(data));
- copy.attr("vars") = symbols ;
- copy.attr("drop") = drop ;
- if( !symbols.size() )
- stop("no variables to group by") ;
- return build_index_cpp(copy) ;
-}
-
-DataFrame build_index_cpp( DataFrame data ){
- ListOf<Symbol> symbols( data.attr( "vars" ) ) ;
-
- int nsymbols = symbols.size() ;
- CharacterVector vars(nsymbols) ;
- CharacterVector names = data.names() ;
- for( int i=0; i<nsymbols; i++){
- vars[i] = PRINTNAME(symbols[i]) ;
- }
- IntegerVector indx = r_match(vars, names ) ;
-
- for( int i=0; i<nsymbols; i++){
- int pos = indx[i] ;
- if( pos == NA_INTEGER){
- stop("unknown column '%s' ", CHAR(names[i]) ) ;
- }
-
- SEXP v = data[pos-1] ;
-
- if( !white_list(v) || TYPEOF(v) == VECSXP ){
- const char* name = vars[i] ;
- stop( "cannot group column %s, of class '%s'", name, get_single_class(v) ) ;
- }
- }
-
- DataFrameVisitors visitors(data, vars) ;
- ChunkIndexMap map( visitors ) ;
-
- train_push_back( map, data.nrows() ) ;
-
- DataFrame labels = DataFrameSubsetVisitors(data, vars).subset( map, "data.frame") ;
- int ngroups = labels.nrows() ;
- IntegerVector labels_order = OrderVisitors(labels).apply() ;
-
- labels = DataFrameSubsetVisitors(labels).subset(labels_order, "data.frame" ) ;
-
- List indices(ngroups) ;
- IntegerVector group_sizes = no_init( ngroups );
- int biggest_group = 0 ;
-
- ChunkIndexMap::const_iterator it = map.begin() ;
- std::vector<const std::vector<int>* > chunks(ngroups) ;
- for( int i=0; i<ngroups; i++, ++it){
- chunks[i] = &it->second ;
- }
-
- for( int i=0; i<ngroups; i++){
- int idx = labels_order[i] ;
- const std::vector<int>& chunk = *chunks[idx] ;
- indices[i] = chunk ;
- group_sizes[i] = chunk.size() ;
- biggest_group = std::max( biggest_group, (int)chunk.size() );
- }
-
- data.attr( "indices" ) = indices ;
- data.attr( "group_sizes") = group_sizes ;
- data.attr( "biggest_group_size" ) = biggest_group ;
- data.attr( "labels" ) = labels ;
- data.attr( "class" ) = CharacterVector::create("grouped_df", "tbl_df", "tbl", "data.frame") ;
- return data ;
-}
-
-DataFrame build_index_adj(DataFrame df, ListOf<Symbol> symbols ){
- int nsymbols = symbols.size() ;
- CharacterVector vars(nsymbols) ;
- for( int i=0; i<nsymbols; i++){
- vars[i] = PRINTNAME(symbols[i]) ;
- }
-
- DataFrameVisitors visitors(df, vars) ;
- std::vector<int> sizes ;
- int n = df.nrows() ;
-
- int i=0 ;
- while( i<n ){
- int start = i++ ;
- for( ; i<n && visitors.equal(i, start) ; i++) ;
- sizes.push_back(i-start) ;
- }
-
- n = sizes.size() ;
- List indices(n);
- IntegerVector first = no_init(n) ;
- int start = 0 ;
- int biggest_group = 0 ;
- for( int i=0; i<n; i++){
- first[i] = start ;
- int end = start + sizes[i] - 1 ;
- indices[i] = seq(start, end) ;
- start = end + 1 ;
- biggest_group = std::max( biggest_group, sizes[i]) ;
- }
-
- df.attr( "indices") = indices ;
- df.attr( "labels") = DataFrameSubsetVisitors(df, vars).subset(first, "data.frame") ;
- df.attr( "group_sizes") = sizes ;
- df.attr( "biggest_group_size") = biggest_group ;
- df.attr( "class" ) = CharacterVector::create("adj_grouped_df", "grouped_df", "tbl_df", "tbl", "data.frame") ;
- df.attr( "vars" ) = symbols ;
-
- return df ;
-}
-
-// [[Rcpp::export]]
-DataFrame grouped_df_adj_impl( DataFrame data, ListOf<Symbol> symbols, bool drop ){
- DataFrame copy( shallow_copy(data));
- copy.attr("vars") = symbols ;
- copy.attr("drop") = drop ;
- return build_index_adj(data, symbols) ;
-}
-
-typedef dplyr_hash_set<SEXP> SymbolSet ;
-
-inline SEXP check_filter_integer_result(SEXP tmp){
- if( TYPEOF(tmp) != INTSXP && TYPEOF(tmp) != REALSXP && TYPEOF(tmp) != LGLSXP ){
- stop( "slice condition does not evaluate to an integer or numeric vector. " ) ;
- }
- return tmp ;
-}
-
-class CountIndices {
-public:
- CountIndices( int nr_, IntegerVector test_ ) : nr(nr_), test(test_), n_pos(0), n_neg(0){
-
- for( int j=0; j<test.size(); j++){
- int i = test[j] ;
- if( i > 0 && i <= nr ) {
- n_pos++ ;
- } else if( i < 0 && i >= -nr ){
- n_neg++ ;
- }
- }
-
- if( n_neg > 0 && n_pos > 0 ){
- stop( "found %d positive indices and %d negative indices", n_pos, n_neg );
- }
-
- }
-
- inline bool is_positive() const { return n_pos > 0 ; }
- inline int get_n_positive() const { return n_pos; }
- inline int get_n_negative() const { return n_neg; }
-
-private:
- int nr ;
- IntegerVector test ;
- int n_pos ;
- int n_neg ;
-} ;
-
-SEXP slice_grouped(GroupedDataFrame gdf, const LazyDots& dots){
- typedef GroupedCallProxy<GroupedDataFrame, LazyGroupedSubsets> Proxy ;
-
- const DataFrame& data = gdf.data() ;
- const Lazy& lazy = dots[0] ;
- Environment env = lazy.env() ;
- CharacterVector names = data.names() ;
- SymbolSet set ;
- for( int i=0; i<names.size(); i++){
- set.insert( Rf_installChar( names[i] ) ) ;
- }
-
- // we already checked that we have only one expression
- Call call( lazy.expr() ) ;
-
- std::vector<int> indx ; indx.reserve(1000) ;
-
- IntegerVector g_test ;
- Proxy call_proxy( call, gdf, env ) ;
-
- int ngroups = gdf.ngroups() ;
- GroupedDataFrame::group_iterator git = gdf.group_begin() ;
- for( int i=0; i<ngroups; i++, ++git){
- SlicingIndex indices = *git ;
- int nr = indices.size() ;
- g_test = check_filter_integer_result( call_proxy.get( indices ) ) ;
- CountIndices counter( indices.size(), g_test ) ;
-
- if( counter.is_positive() ){
- // positive indexing
- int ntest = g_test.size() ;
- for( int j=0; j<ntest; j++){
- if( !( g_test[j] > nr || g_test[j] == NA_INTEGER ) ){
- indx.push_back( indices[g_test[j]-1] ) ;
- }
- }
- } else if( counter.get_n_negative() != 0){
- // negative indexing
- std::set<int> drop ;
- int n = g_test.size() ;
- for( int j=0; j<n; j++){
- if( g_test[j] != NA_INTEGER)
- drop.insert( -g_test[j] ) ;
- }
- int n_drop = drop.size() ;
- std::set<int>::const_iterator drop_it = drop.begin() ;
-
- int k = 0, j = 0 ;
- while( drop_it != drop.end() ){
- int next_drop = *drop_it - 1;
- while( j < next_drop ){
- indx.push_back( indices[j++] ) ;
- k++ ;
- }
- j++ ;
- ++drop_it ;
- }
- while( k < nr - n_drop){
- indx.push_back( indices[j++] ) ;
- k++ ;
- }
-
- }
- }
- DataFrame res = subset( data, indx, names, classes_grouped<GroupedDataFrame>() ) ;
- res.attr( "vars") = data.attr("vars") ;
- strip_index(res) ;
-
- return GroupedDataFrame(res).data() ;
-
-}
-
-SEXP slice_not_grouped( const DataFrame& df, const LazyDots& dots){
- CharacterVector names = df.names() ;
- SymbolSet set ;
- for( int i=0; i<names.size(); i++){
- set.insert( Rf_installChar( names[i] ) ) ;
- }
- const Lazy& lazy = dots[0] ;
- Call call( lazy.expr() );
- CallProxy proxy( call, df, lazy.env() ) ;
- int nr = df.nrows() ;
-
- IntegerVector test = check_filter_integer_result(proxy.eval()) ;
-
- int n = test.size() ;
-
- // count the positive and negatives
- CountIndices counter(nr, test) ;
-
- // just positives -> one based subset
- if( counter.is_positive() ){
- int n_pos = counter.get_n_positive() ;
- std::vector<int> idx(n_pos) ;
- int j=0 ;
- for( int i=0; i<n_pos; i++){
- while( test[j] > nr || test[j] == NA_INTEGER) j++ ;
- idx[i] = test[j++] - 1 ;
- }
-
- return subset( df, idx, df.names(), classes_not_grouped() ) ;
- }
-
- // special case where only NA
- if( counter.get_n_negative() == 0){
- std::vector<int> indices ;
- DataFrame res = subset( df, indices, df.names(), classes_not_grouped() ) ;
- return res ;
- }
-
- // just negatives (out of range is dealt with early in CountIndices).
- std::set<int> drop ;
- for( int i=0; i<n; i++){
- if( test[i] != NA_INTEGER )
- drop.insert( -test[i] ) ;
- }
- int n_drop = drop.size() ;
- std::vector<int> indices(nr - n_drop) ;
- std::set<int>::const_iterator drop_it = drop.begin() ;
-
- int i = 0, j = 0 ;
- while( drop_it != drop.end() ){
- int next_drop = *drop_it - 1;
- while( j < next_drop ){
- indices[i++] = j++ ;
- }
- j++ ;
- ++drop_it ;
- }
- while( i < nr - n_drop){
- indices[i++] = j++ ;
- }
-
- DataFrame res = subset( df, indices, df.names(), classes_not_grouped() ) ;
- return res ;
-
-}
-
-// [[Rcpp::export]]
-SEXP slice_impl( DataFrame df, LazyDots dots){
- if( dots.size() == 0 ) return df ;
- if( dots.size() != 1 )
- stop( "slice only accepts one expression" );
- if( is<GroupedDataFrame>(df) ){
- return slice_grouped( GroupedDataFrame(df), dots ) ;
- } else {
- return slice_not_grouped(df, dots ) ;
- }
-}
-
-template <typename Data>
-SEXP structure_mutate( const NamedListAccumulator<Data>& accumulator, const DataFrame& df, CharacterVector classes){
- List res = accumulator ;
- res.attr("class") = classes ;
- set_rownames( res, df.nrows() ) ;
- res.attr( "vars") = df.attr("vars") ;
- res.attr( "labels" ) = df.attr("labels" );
- res.attr( "index") = df.attr("index") ;
- res.attr( "indices" ) = df.attr("indices" ) ;
- res.attr( "drop" ) = df.attr("drop" ) ;
- res.attr( "group_sizes" ) = df.attr("group_sizes" ) ;
- res.attr( "biggest_group_size" ) = df.attr("biggest_group_size" ) ;
-
- return res ;
-}
-
-void check_not_groups(const CharacterVector& result_names, const RowwiseDataFrame& gdf){}
-void check_not_groups(const LazyDots& dots, const RowwiseDataFrame& gdf){}
-
-void check_not_groups(const CharacterVector& result_names, const GroupedDataFrame& gdf){
- int n = result_names.size() ;
- for( int i=0; i<n; i++){
- if( gdf.has_group( result_names[i] ) )
- stop( "cannot modify grouping variable" ) ;
- }
-}
-void check_not_groups(const LazyDots& dots, const GroupedDataFrame& gdf){
- int n = dots.size() ;
- for( int i=0; i<n; i++){
- if( gdf.has_group( dots[i].name() ) )
- stop( "cannot modify grouping variable" ) ;
- }
-}
-
-
-SEXP mutate_not_grouped(DataFrame df, const LazyDots& dots){
- int nexpr = dots.size() ;
- int nrows = df.nrows() ;
-
- NamedListAccumulator<DataFrame> accumulator ;
- int nvars = df.size() ;
- if( nvars ){
- CharacterVector df_names = df.names() ;
- for( int i=0; i<nvars; i++){
- accumulator.set( df_names[i], df[i] ) ;
- }
- }
-
- CallProxy call_proxy(df) ;
- List results(nexpr) ;
-
- for( int i=0; i<nexpr; i++){
- Rcpp::checkUserInterrupt() ;
- const Lazy& lazy = dots[i] ;
-
- Shield<SEXP> call_( lazy.expr() ) ; SEXP call = call_ ;
- SEXP name = lazy.name() ;
- Environment env = lazy.env() ;
- call_proxy.set_env(env) ;
-
- if( TYPEOF(call) == SYMSXP ){
- if(call_proxy.has_variable(call)){
- results[i] = call_proxy.get_variable(PRINTNAME(call)) ;
- } else {
- results[i] = shared_SEXP(env.find(CHAR(PRINTNAME(call)))) ;
- }
- } else if( TYPEOF(call) == LANGSXP ){
- call_proxy.set_call( call );
- results[i] = call_proxy.eval() ;
- } else if( Rf_length(call) == 1 ){
- boost::scoped_ptr<Gatherer> gather( constant_gatherer( call, nrows ) );
- results[i] = gather->collect() ;
- } else if( Rf_isNull(call)) {
- accumulator.rm(name) ;
- continue ;
- } else {
- stop( "cannot handle" ) ;
- }
-
- check_supported_type(results[i], name) ;
-
- if( Rf_inherits(results[i], "POSIXlt") ){
- stop("`mutate` does not support `POSIXlt` results");
- }
- int n_res = Rf_length(results[i]) ;
- if( n_res == nrows ){
- // ok
- } else if( n_res == 1 ){
- // recycle
- boost::scoped_ptr<Gatherer> gather( constant_gatherer( results[i] , df.nrows() ) );
- results[i] = gather->collect() ;
- } else {
- stop( "wrong result size (%d), expected %d or 1", n_res, nrows ) ;
- }
-
- call_proxy.input( name, results[i] ) ;
- accumulator.set( name, results[i] );
- }
- List res = structure_mutate(accumulator, df, classes_not_grouped() ) ;
-
- return res ;
-}
-
-template <typename Data, typename Subsets>
-SEXP mutate_grouped(const DataFrame& df, const LazyDots& dots){
- // special 0 rows case
- if( df.nrows() == 0 ){
- DataFrame res = mutate_not_grouped(df, dots) ;
- res.attr("vars") = df.attr("vars") ;
- res.attr("class") = df.attr("class") ;
- return Data(res).data() ;
- }
-
- typedef GroupedCallProxy<Data, Subsets> Proxy;
- Data gdf(df);
- int nexpr = dots.size() ;
- check_not_groups(dots, gdf);
-
- Proxy proxy(gdf) ;
-
- NamedListAccumulator<Data> accumulator ;
- int ncolumns = df.size() ;
- CharacterVector column_names = df.names() ;
- for( int i=0; i<ncolumns; i++){
- accumulator.set( column_names[i], df[i] ) ;
- }
-
- List variables(nexpr) ;
- for( int i=0; i<nexpr; i++){
- Rcpp::checkUserInterrupt() ;
- const Lazy& lazy = dots[i] ;
-
- Environment env = lazy.env() ;
- Shield<SEXP> call_( lazy.expr() );
- SEXP call = call_ ;
- SEXP name = lazy.name() ;
- proxy.set_env( env ) ;
-
- if( TYPEOF(call) == SYMSXP ){
- if(proxy.has_variable(call)){
- SEXP variable = variables[i] = proxy.get_variable( PRINTNAME(call) ) ;
- proxy.input( name, variable ) ;
- accumulator.set( name, variable) ;
- } else {
- SEXP v = env.find(CHAR(PRINTNAME(call))) ;
- check_supported_type(v, name) ;
- if( Rf_isNull(v) ){
- stop( "unknown variable: %s", CHAR(PRINTNAME(call)) );
- } else if( Rf_length(v) == 1){
- boost::scoped_ptr<Gatherer> rep( constant_gatherer(v, gdf.nrows() ) );
- SEXP variable = variables[i] = rep->collect() ;
- proxy.input( name, variable ) ;
- accumulator.set( name, variable) ;
- } else {
- int n = Rf_length(v) ;
- bool test = all( gdf.get_group_sizes() == n ).is_true() ;
- if( !test ){
- stop( "impossible to replicate vector of size %d", n );
- }
-
- boost::scoped_ptr<Replicator> rep( replicator<Data>(v, gdf) ) ;
- SEXP variable = variables[i] = rep->collect() ;
- proxy.input( name, variable ) ;
- accumulator.set( name, variable) ;
- }
- }
-
- } else if(TYPEOF(call) == LANGSXP){
- proxy.set_call( call );
- boost::scoped_ptr<Gatherer> gather( gatherer<Data, Subsets>( proxy, gdf, name ) );
- SEXP variable = variables[i] = gather->collect() ;
- proxy.input( name, variable ) ;
- accumulator.set( name, variable) ;
- } else if(Rf_length(call) == 1) {
- boost::scoped_ptr<Gatherer> gather( constant_gatherer( call, gdf.nrows() ) );
- SEXP variable = variables[i] = gather->collect() ;
- proxy.input( name, variable ) ;
- accumulator.set( name, variable) ;
- } else if( Rf_isNull(call) ){
- accumulator.rm(name) ;
- continue ;
- } else {
- stop( "cannot handle" ) ;
- }
- }
-
- return structure_mutate(accumulator, df, df.attr("class") );
-}
-
-
-// [[Rcpp::export]]
-SEXP mutate_impl( DataFrame df, LazyDots dots){
- if( dots.size() == 0 ) return df ;
- check_valid_colnames(df) ;
- if(is<RowwiseDataFrame>(df) ) {
- return mutate_grouped<RowwiseDataFrame, LazyRowwiseSubsets>( df, dots);
- } else if( is<GroupedDataFrame>( df ) ){
- return mutate_grouped<GroupedDataFrame, LazyGroupedSubsets>( df, dots);
- } else {
- return mutate_not_grouped( df, dots) ;
- }
-}
-
-// [[Rcpp::export]]
-IntegerVector order_impl( List args, Environment env ){
- int nargs = args.size() ;
- SEXP tmp ;
- List variables(nargs) ;
- LogicalVector ascending(nargs) ;
- for(int i=0; i<nargs; i++){
- tmp = args[i] ;
- if( TYPEOF(tmp) == LANGSXP && CAR(tmp) == Rf_install("desc") ){
- variables[i] = Rf_eval( CAR(CDR(tmp) ), env ) ;
- ascending[i] = false ;
- } else{
- variables[i] = Rf_eval( tmp, env );
- ascending[i] = true ;
- }
- }
- OrderVisitors o(variables,ascending, nargs) ;
- IntegerVector res = o.apply() ;
- res = res + 1 ;
- return res ;
-}
-
-// [[Rcpp::export]]
-DataFrame sort_impl( DataFrame data ){
- IntegerVector index = OrderVisitors(data).apply() ;
- return DataFrameSubsetVisitors( data, data.names() ).subset(index, "data.frame" ) ;
-}
-
-// [[Rcpp::export]]
-IntegerVector group_size_grouped_cpp( GroupedDataFrame gdf ){
- return Count().process(gdf) ;
-}
-
-// [[Rcpp::export]]
-SEXP n_distinct_multi( List variables, bool na_rm = false){
- if( variables.length() == 0 ) {
- stop("need at least one column for n_distinct()");
- }
-
- MultipleVectorVisitors visitors(variables) ;
- SlicingIndex everything(0, visitors.nrows()) ;
- if( na_rm ){
- Count_Distinct_Narm<MultipleVectorVisitors> counter(visitors) ;
- return counter.process(everything) ;
- } else {
- Count_Distinct<MultipleVectorVisitors> counter(visitors) ;
- return counter.process(everything) ;
- }
-}
-
-// [[Rcpp::export]]
-DataFrame as_regular_df(DataFrame df){
- DataFrame copy( shallow_copy(df));
- SET_ATTRIB(copy, strip_group_attributes(df)) ;
- SET_OBJECT(copy, OBJECT(df)) ;
- copy.attr("class") = CharacterVector::create("data.frame") ;
- return copy ;
-}
-
-// [[Rcpp::export]]
-DataFrame ungroup_grouped_df( DataFrame df){
- DataFrame copy( shallow_copy(df));
- SET_ATTRIB(copy, strip_group_attributes(df)) ;
- return copy ;
-}
-
-// [[Rcpp::export]]
-std::vector<std::vector<int> > split_indices(IntegerVector group, int groups) {
- std::vector<std::vector<int> > ids(groups);
-
- int n = group.size();
- for (int i = 0; i < n; ++i) {
- ids[group[i] - 1].push_back(i + 1);
- }
-
- return ids;
-}
-
-
-// simple internal debugging function to access the gp part of the SEXP
-// only meant for internal use in dplyr debugging
-
-// [[Rcpp::export]]
-unsigned short gp( SEXP x){
- return reinterpret_cast<sxpinfo_struct*>(x)->gp ;
-}
diff --git a/src/encoding.cpp b/src/encoding.cpp
new file mode 100644
index 0000000..87ab908
--- /dev/null
+++ b/src/encoding.cpp
@@ -0,0 +1,59 @@
+#include "pch.h"
+#include <dplyr/main.h>
+
+#include <tools/encoding.h>
+#include <tools/utils.h>
+
+namespace dplyr {
+
+R_xlen_t get_first_reencode_pos(const CharacterVector& x) {
+ R_xlen_t len = x.length();
+ for (R_xlen_t i = 0; i < len; ++i) {
+ SEXP xi = x[i];
+ if (xi != NA_STRING && !IS_ASCII(xi) && !IS_UTF8(xi)) {
+ return i;
+ }
+ }
+
+ return len;
+}
+
+CharacterVector reencode_char(SEXP x) {
+ if (Rf_isFactor(x)) return reencode_factor(x);
+
+ CharacterVector ret(x);
+ R_xlen_t first = get_first_reencode_pos(ret);
+ if (first >= ret.length()) return ret;
+
+ ret = clone(ret);
+
+ R_xlen_t len = ret.length();
+ for (R_xlen_t i = first; i < len; ++i) {
+ SEXP reti = ret[i];
+ if (reti != NA_STRING && !IS_ASCII(reti) && !IS_UTF8(reti)) {
+ ret[i] = String(Rf_translateCharUTF8(reti), CE_UTF8);
+ }
+ }
+
+ return ret;
+}
+
+CharacterVector reencode_factor(IntegerVector x) {
+ CharacterVector levels(reencode_char(get_levels(x)));
+ CharacterVector ret(x.length());
+
+ R_xlen_t nlevels = levels.length();
+
+ R_xlen_t len = x.length();
+ for (R_xlen_t i = 0; i < len; ++i) {
+ int xi = x[i];
+ if (xi <= 0 || xi > nlevels)
+ ret[i] = NA_STRING;
+ else
+ ret[i] = levels[xi - 1];
+ }
+
+ return ret;
+}
+
+}
diff --git a/src/filter.cpp b/src/filter.cpp
index d38f62e..71476f2 100644
--- a/src/filter.cpp
+++ b/src/filter.cpp
@@ -1,295 +1,108 @@
-#include <dplyr.h>
+#include "pch.h"
+#include <dplyr/main.h>
-using namespace Rcpp ;
-using namespace dplyr ;
+#include <tools/hash.h>
+#include <tools/Quosure.h>
+#include <tools/utils.h>
+#include <tools/SymbolString.h>
-typedef dplyr_hash_set<SEXP> SymbolSet ;
+#include <dplyr/GroupedDataFrame.h>
-namespace dplyr {
- void strip_index(DataFrame x) {
- x.attr("indices") = R_NilValue ;
- x.attr("group_sizes") = R_NilValue ;
- x.attr("biggest_group_size") = R_NilValue ;
- x.attr("labels") = R_NilValue ;
- }
-}
-
-inline SEXP empty_subset( const DataFrame& df, CharacterVector columns, CharacterVector classes ){
- DataFrame res = DataFrameSubsetVisitors(df, columns).subset( EmptySubset(), classes) ;
- strip_index(res);
- return res;
-}
-
-SEXP assert_correct_filter_subcall(SEXP x, const SymbolSet& set, const Environment& env){
- switch(TYPEOF(x)){
- case LGLSXP: return x;
- case LANGSXP: return x ;
- case SYMSXP:
- {
- if( set.count(x) ) return x ;
-
- // look in the environment
- SEXP var = PROTECT( Rf_findVar( x, env ) ) ;
- SEXP res = Rf_duplicate(var) ;
- UNPROTECT(1) ;
- if( res == R_UnboundValue ){
- if( x == Rf_install("T") ){
- return Rf_ScalarLogical(TRUE) ;
- } else if( x == Rf_install("F") ){
- return Rf_ScalarLogical(FALSE) ;
- }
- stop( "unknown column : %s", CHAR(PRINTNAME(x)) );
- }
- return res ;
- }
- default:
- break ;
- }
- stop("incompatible expression in filter") ;
- return x ; // never happens
-}
-
-SEXP and_calls( const LazyDots& dots, const SymbolSet& set, const Environment& env ){
- int ncalls = dots.size() ;
- if( !ncalls ) {
- stop("incompatible input") ;
- }
- Shield<SEXP> call_( dots[0].expr() ) ;
-
- RObject res( assert_correct_filter_subcall(call_, set, env) ) ;
-
- SEXP and_symbol = Rf_install( "&" ) ;
- for( int i=1; i<ncalls; i++){
- Shield<SEXP> call( dots[i].expr() ) ;
- res = Rcpp_lang3( and_symbol, res, assert_correct_filter_subcall(call, set, env) ) ;
- }
- return res ;
-}
+#include <dplyr/Result/LazyRowwiseSubsets.h>
+#include <dplyr/Result/GroupedCallProxy.h>
+#include <dplyr/Result/CallProxy.h>
-void check_filter_result(const LogicalVector& test, int n){
- if( test.size() != n ) {
- stop( "incorrect length (%d), expecting: %d", test.size(), n );
- }
-}
+#include <dplyr/bad.h>
-inline SEXP check_filter_logical_result(SEXP tmp){
- if( TYPEOF(tmp) != LGLSXP ){
- stop( "filter condition does not evaluate to a logical vector. " ) ;
- }
- return tmp ;
-}
+using namespace Rcpp;
+using namespace dplyr;
-template <typename Data>
-inline DataFrame grouped_subset( const Data& gdf, const LogicalVector& test, const CharacterVector& names, CharacterVector classes){
- DataFrame data = gdf.data() ;
- DataFrame res = subset( data, test, names, classes) ;
- res.attr("vars") = data.attr("vars") ;
+inline
+SEXP empty_subset(const DataFrame& df, const CharacterVector& classes) {
+ DataFrame res = DataFrameSubsetVisitors(df).subset(EmptySubset(), classes);
strip_index(res);
- return Data(res).data() ;
+ return res;
}
-template <typename Data, typename Subsets>
-DataFrame filter_grouped_single_env( const Data& gdf, const LazyDots& dots){
- typedef GroupedCallProxy<Data, Subsets> Proxy ;
- Environment env = dots[0].env() ;
-
- const DataFrame& data = gdf.data() ;
- CharacterVector names = data.names() ;
- SymbolSet set ;
- for( int i=0; i<names.size(); i++){
- set.insert( Rf_installChar( names[i] ) ) ;
- }
-
- // a, b, c -> a & b & c
- Call call( and_calls( dots, set, env ) ) ;
-
- int nrows = data.nrows() ;
- LogicalVector test(nrows, TRUE);
-
- LogicalVector g_test ;
- Proxy call_proxy( call, gdf, env ) ;
-
- int ngroups = gdf.ngroups() ;
- typename Data::group_iterator git = gdf.group_begin() ;
- for( int i=0; i<ngroups; i++, ++git){
- SlicingIndex indices = *git ;
- int chunk_size = indices.size() ;
-
- g_test = check_filter_logical_result( call_proxy.get( indices ) ) ;
- if( g_test.size() == 1 ){
- int val = g_test[0] == TRUE ;
- for( int j=0; j<chunk_size; j++){
- test[ indices[j] ] = val ;
- }
- } else {
- check_filter_result(g_test, chunk_size ) ;
- for( int j=0; j<chunk_size; j++){
- if( g_test[j] != TRUE ) test[ indices[j] ] = FALSE ;
- }
- }
- }
- return grouped_subset<Data>( gdf, test, names, classes_grouped<Data>() ) ;
+inline
+void check_result_length(const LogicalVector& test, int n) {
+ if (test.size() != n) {
+ stop("Result must have length %d, not %d", n, test.size());
+ }
}
-
-// version of grouped filter when contributions to ... come from several environment
-template <typename Data, typename Subsets>
-DataFrame filter_grouped_multiple_env( const Data& gdf, const LazyDots& dots){
- const DataFrame& data = gdf.data() ;
- CharacterVector names = data.names() ;
- SymbolSet set ;
- for( int i=0; i<names.size(); i++){
- set.insert( Rf_installChar( names[i] ) ) ;
- }
-
- int nrows = data.nrows() ;
- LogicalVector test(nrows, TRUE);
-
- LogicalVector g_test ;
-
- for( int k=0; k<dots.size(); k++){
- Rcpp::checkUserInterrupt() ;
- const Lazy& lazy = dots[k] ;
-
- Call call( lazy.expr() ) ;
- GroupedCallProxy<Data, Subsets> call_proxy( call, gdf, lazy.env() ) ;
- int ngroups = gdf.ngroups() ;
- typename Data::group_iterator git = gdf.group_begin() ;
- for( int i=0; i<ngroups; i++, ++git){
- SlicingIndex indices = *git ;
- int chunk_size = indices.size() ;
-
- g_test = check_filter_logical_result(call_proxy.get( indices ));
- if( g_test.size() == 1 ){
- if( g_test[0] != TRUE ){
- for( int j=0; j<chunk_size; j++){
- test[indices[j]] = FALSE ;
- }
- }
- } else {
- check_filter_result(g_test, chunk_size ) ;
- for( int j=0; j<chunk_size; j++){
- if( g_test[j] != TRUE ){
- test[ indices[j] ] = FALSE ;
- }
- }
- }
- }
- }
- return grouped_subset<Data>( gdf, test, names, classes_grouped<Data>() ) ;
+inline
+SEXP check_result_lgl_type(SEXP tmp) {
+ if (TYPEOF(tmp) != LGLSXP) {
+ bad_pos_arg(2, "filter condition does not evaluate to a logical vector");
+ }
+ return tmp;
}
-template <typename Data, typename Subsets>
-DataFrame filter_grouped( const Data& gdf, const LazyDots& dots){
- if( dots.single_env() ){
- return filter_grouped_single_env<Data, Subsets>(gdf, dots) ;
+template <typename SlicedTibble, typename Subsets>
+DataFrame filter_grouped(const SlicedTibble& gdf, const NamedQuosure& quo) {
+ typedef GroupedCallProxy<SlicedTibble, Subsets> Proxy;
+ const DataFrame& data = gdf.data();
+
+ LogicalVector test(data.nrows(), TRUE);
+ LogicalVector g_test;
+ Proxy call_proxy(quo.expr(), gdf, quo.env());
+
+ int ngroups = gdf.ngroups();
+ typename SlicedTibble::group_iterator git = gdf.group_begin();
+ for (int i = 0; i < ngroups; i++, ++git) {
+ const SlicingIndex& indices = *git;
+ int chunk_size = indices.size();
+
+ g_test = check_result_lgl_type(call_proxy.get(indices));
+ if (g_test.size() == 1) {
+ int val = g_test[0] == TRUE;
+ for (int j = 0; j < chunk_size; j++) {
+ test[indices[j]] = val;
+ }
} else {
- return filter_grouped_multiple_env<Data, Subsets>(gdf, dots) ;
+ check_result_length(g_test, chunk_size);
+ for (int j = 0; j < chunk_size; j++) {
+ if (g_test[j] != TRUE) test[ indices[j] ] = FALSE;
+ }
}
-}
+ }
-bool combine_and(LogicalVector& test, const LogicalVector& test2){
- int n = test.size() ;
- if(n == 1) {
- test = test2 ;
- } else {
- int n2 = test2.size() ;
- if( n2 == 1 ){
- if( !test2[0] ){
- return true ;
- }
- } else if( n2 == n) {
- for( int i=0; i<n; i++){
- test[i] = test[i] && test2[i] ;
- }
- } else {
- stop( "incompatible sizes" ) ;
- }
- }
- return false;
+ // Subset the grouped data frame
+ DataFrame res = subset(data, test, data.names(), classes_grouped<SlicedTibble>());
+ copy_vars(res, data);
+ strip_index(res);
+ return SlicedTibble(res).data();
}
-DataFrame filter_not_grouped( DataFrame df, const LazyDots& dots){
- CharacterVector names = df.names() ;
- SymbolSet set ;
- for( int i=0; i<names.size(); i++){
- set.insert( Rf_installChar( names[i] ) ) ;
- }
- if( dots.single_env() ){
- Environment env = dots[0].env() ;
- // a, b, c -> a & b & c
- Shield<SEXP> call( and_calls( dots, set, env ) ) ;
-
- // replace the symbols that are in the data frame by vectors from the data frame
- // and evaluate the expression
- CallProxy proxy( (SEXP)call, df, env ) ;
- LogicalVector test = check_filter_logical_result(proxy.eval()) ;
+DataFrame filter_ungrouped(DataFrame df, const NamedQuosure& quo) {
+ CallProxy proxy(quo.expr(), df, quo.env());
+ LogicalVector test = check_result_lgl_type(proxy.eval());
- if( test.size() == 1){
- if( test[0] == TRUE ){
- return df ;
- } else {
- return empty_subset(df, df.names(), classes_not_grouped()) ;
- }
- } else {
- check_filter_result(test, df.nrows());
- return subset(df, test, classes_not_grouped() ) ;
- }
+ if (test.size() == 1) {
+ if (test[0] == TRUE) {
+ return df;
} else {
- int nargs = dots.size() ;
-
- Call call(dots[0].expr());
- CallProxy first_proxy(call, df, dots[0].env() ) ;
- LogicalVector test = check_filter_logical_result(first_proxy.eval()) ;
- if( test.size() == 1 ) {
- if( !test[0] ){
- return empty_subset(df, df.names(), classes_not_grouped() ) ;
- }
- } else {
- check_filter_result(test, df.nrows());
- }
-
- for( int i=1; i<nargs; i++){
- Rcpp::checkUserInterrupt() ;
-
- Call call( dots[i].expr() ) ;
- CallProxy proxy(call, df, dots[i].env() ) ;
- LogicalVector test2 = check_filter_logical_result(proxy.eval()) ;
- if( combine_and(test, test2) ){
- return empty_subset(df, df.names(), classes_not_grouped() ) ;
- }
- }
-
- DataFrame res = subset( df, test, classes_not_grouped() ) ;
- return res ;
+ return empty_subset(df, classes_not_grouped());
}
+ } else {
+ check_result_length(test, df.nrows());
+ return subset(df, test, classes_not_grouped());
+ }
}
// [[Rcpp::export]]
-SEXP filter_impl( DataFrame df, LazyDots dots){
- if( df.nrows() == 0 || Rf_isNull(df) ) {
- return df ;
- }
- check_valid_colnames(df) ;
- assert_all_white_list(df) ;
-
- if( dots.size() == 0 ) return df ;
-
- // special case
- if( dots.size() == 1 && TYPEOF(dots[0].expr()) == LGLSXP){
- LogicalVector what = dots[0].expr() ;
- if( what.size() == 1 ){
- if( what[0] == TRUE ){
- return df ;
- } else {
- return empty_subset( df, df.names(), is<GroupedDataFrame>(df) ? classes_grouped<GroupedDataFrame>() : classes_not_grouped() ) ;
- }
- }
- }
- if( is<GroupedDataFrame>( df ) ){
- return filter_grouped<GroupedDataFrame, LazyGroupedSubsets>( GroupedDataFrame(df), dots);
- } else if( is<RowwiseDataFrame>(df) ){
- return filter_grouped<RowwiseDataFrame, LazyRowwiseSubsets>( RowwiseDataFrame(df), dots);
- } else {
- return filter_not_grouped( df, dots ) ;
- }
+SEXP filter_impl(DataFrame df, NamedQuosure quo) {
+ if (df.nrows() == 0 || Rf_isNull(df)) {
+ return df;
+ }
+ check_valid_colnames(df);
+ assert_all_white_list(df);
+
+ if (is<GroupedDataFrame>(df)) {
+ return filter_grouped<GroupedDataFrame, LazyGroupedSubsets>(GroupedDataFrame(df), quo);
+ } else if (is<RowwiseDataFrame>(df)) {
+ return filter_grouped<RowwiseDataFrame, LazyRowwiseSubsets>(RowwiseDataFrame(df), quo);
+ } else {
+ return filter_ungrouped(df, quo);
+ }
}
diff --git a/src/group_by.cpp b/src/group_by.cpp
new file mode 100644
index 0000000..c04f126
--- /dev/null
+++ b/src/group_by.cpp
@@ -0,0 +1,43 @@
+#include "pch.h"
+#include <dplyr/main.h>
+
+#include <tools/match.h>
+#include <tools/utils.h>
+
+#include <dplyr/tbl_cpp.h>
+#include <dplyr/Groups.h>
+
+using namespace Rcpp;
+using namespace dplyr;
+
+// [[Rcpp::export]]
+DataFrame grouped_df_impl(DataFrame data, SymbolVector symbols, bool drop) {
+ assert_all_white_list(data);
+ DataFrame copy(shallow_copy(data));
+ set_vars(copy, symbols);
+ copy.attr("drop") = drop;
+ if (!symbols.size())
+ stop("no variables to group by");
+ return build_index_cpp(copy);
+}
+
+// [[Rcpp::export]]
+DataFrame as_regular_df(DataFrame df) {
+ DataFrame copy(shallow_copy(df));
+ SET_ATTRIB(copy, strip_group_attributes(df));
+ SET_OBJECT(copy, OBJECT(df));
+ set_class(copy, CharacterVector::create("data.frame"));
+ return copy;
+}
+
+// [[Rcpp::export]]
+DataFrame ungroup_grouped_df(DataFrame df) {
+ DataFrame copy(shallow_copy(df));
+ SET_ATTRIB(copy, strip_group_attributes(df));
+ return copy;
+}
+
+// [[Rcpp::export]]
+SEXP test_grouped_df(DataFrame data) {
+ return GroupedDataFrame(data).data();
+}
diff --git a/src/group_indices.cpp b/src/group_indices.cpp
index 044d9e7..d087034 100644
--- a/src/group_indices.cpp
+++ b/src/group_indices.cpp
@@ -1,74 +1,135 @@
-#include <dplyr.h>
+#include "pch.h"
+#include <dplyr/main.h>
-using namespace Rcpp ;
-using namespace dplyr ;
+#include <tools/match.h>
+
+#include <dplyr/white_list.h>
+
+#include <dplyr/GroupedDataFrame.h>
+
+#include <dplyr/Order.h>
+
+#include <dplyr/Result/Count.h>
+
+#include <dplyr/train.h>
+
+#include <dplyr/bad.h>
+
+using namespace Rcpp;
+using namespace dplyr;
// [[Rcpp::export]]
-IntegerVector grouped_indices_grouped_df_impl( GroupedDataFrame gdf ){
- int n=gdf.nrows() ;
- IntegerVector res = no_init(n) ;
- int ngroups = gdf.ngroups() ;
- GroupedDataFrameIndexIterator it = gdf.group_begin() ;
- for(int i=0; i<ngroups; i++, ++it){
- SlicingIndex index = *it ;
- int n_index = index.size() ;
- for( int j=0; j<n_index; j++){
- res[ index[j] ] = i + 1 ;
- }
+IntegerVector grouped_indices_grouped_df_impl(GroupedDataFrame gdf) {
+ int n = gdf.nrows();
+ IntegerVector res = no_init(n);
+ int ngroups = gdf.ngroups();
+ GroupedDataFrameIndexIterator it = gdf.group_begin();
+ for (int i = 0; i < ngroups; i++, ++it) {
+ const SlicingIndex& index = *it;
+ int n_index = index.size();
+ for (int j = 0; j < n_index; j++) {
+ res[ index[j] ] = i + 1;
}
- return res ;
+ }
+ return res;
}
// [[Rcpp::export]]
-IntegerVector grouped_indices_impl( DataFrame data, ListOf<Symbol> symbols ){
- int nsymbols = symbols.size() ;
- if( nsymbols == 0 )
- return rep(1, data.nrows()) ;
- CharacterVector vars(nsymbols) ;
- for( int i=0; i<nsymbols; i++){
- vars[i] = PRINTNAME(symbols[i]) ;
-
- const char* name = vars[i] ;
- SEXP v ;
- try{
- v = data[name] ;
- } catch(...){
- stop( "unknown column '%s'", name ) ;
- }
- if( !white_list(v) || TYPEOF(v) == VECSXP ){
- stop( "cannot group column %s, of class '%s'", name, get_single_class(v) ) ;
- }
+IntegerVector group_size_grouped_cpp(GroupedDataFrame gdf) {
+ return Count().process(gdf);
+}
+
+DataFrame build_index_cpp(DataFrame data) {
+ SymbolVector vars(get_vars(data));
+ const int nvars = vars.size();
+
+ CharacterVector names = data.names();
+ IntegerVector indx = vars.match_in_table(names);
+
+ for (int i = 0; i < nvars; ++i) {
+ int pos = indx[i];
+ if (pos == NA_INTEGER) {
+ bad_col(vars[i], "is unknown");
}
- DataFrameVisitors visitors(data, vars) ;
- ChunkIndexMap map( visitors ) ;
- int n = data.nrows() ;
- train_push_back( map, n ) ;
+ SEXP v = data[pos - 1];
- DataFrame labels = DataFrameSubsetVisitors(data, vars).subset( map, "data.frame") ;
- IntegerVector labels_order = OrderVisitors(labels).apply() ;
+ if (!white_list(v) || TYPEOF(v) == VECSXP) {
+ bad_col(vars[i], "can't be used as a grouping variable because it's a {type}",
+ _["type"] = get_single_class(v));
+ }
+ }
- labels = DataFrameSubsetVisitors(labels).subset(labels_order, "data.frame" ) ;
+ DataFrameVisitors visitors(data, vars);
+ ChunkIndexMap map(visitors);
- int ngroups = map.size() ;
+ train_push_back(map, data.nrows());
- IntegerVector res = no_init(n) ;
+ DataFrame labels = DataFrameSubsetVisitors(data, vars).subset(map, "data.frame");
+ int ngroups = labels.nrows();
+ IntegerVector labels_order = OrderVisitors(labels).apply();
- std::vector<const std::vector<int>* > chunks(ngroups) ;
- ChunkIndexMap::const_iterator it = map.begin() ;
- for( int i=0; i<ngroups; i++, ++it){
- chunks[i] = &it->second ;
- }
+ labels = DataFrameSubsetVisitors(labels).subset(labels_order, "data.frame");
+
+ List indices(ngroups);
+ IntegerVector group_sizes = no_init(ngroups);
+ int biggest_group = 0;
+
+ ChunkIndexMap::const_iterator it = map.begin();
+ std::vector<const std::vector<int>* > chunks(ngroups);
+ for (int i = 0; i < ngroups; i++, ++it) {
+ chunks[i] = &it->second;
+ }
+
+ for (int i = 0; i < ngroups; i++) {
+ int idx = labels_order[i];
+ const std::vector<int>& chunk = *chunks[idx];
+ indices[i] = chunk;
+ group_sizes[i] = chunk.size();
+ biggest_group = std::max(biggest_group, (int)chunk.size());
+ }
+
+ data.attr("indices") = indices;
+ data.attr("group_sizes") = group_sizes;
+ data.attr("biggest_group_size") = biggest_group;
+ data.attr("labels") = labels;
+ set_class(data, CharacterVector::create("grouped_df", "tbl_df", "tbl", "data.frame"));
+ return data;
+}
+
+void strip_index(DataFrame x) {
+ x.attr("indices") = R_NilValue;
+ x.attr("group_sizes") = R_NilValue;
+ x.attr("biggest_group_size") = R_NilValue;
+ x.attr("labels") = R_NilValue;
+}
+
+SEXP strip_group_attributes(SEXP df) {
+ Shield<SEXP> attribs(Rf_cons(dplyr::classes_not_grouped(), R_NilValue));
+ SET_TAG(attribs, Rf_install("class"));
- for( int i=0; i<ngroups; i++){
- int idx = labels_order[i] ;
- const std::vector<int>& v = *chunks[idx] ;
+ SEXP p = ATTRIB(df);
+ std::vector<SEXP> black_list(8);
+ black_list[0] = Rf_install("indices");
+ black_list[1] = Rf_install("vars");
+ black_list[2] = Rf_install("index");
+ black_list[3] = Rf_install("labels");
+ black_list[4] = Rf_install("drop");
+ black_list[5] = Rf_install("group_sizes");
+ black_list[6] = Rf_install("biggest_group_size");
+ black_list[7] = Rf_install("class");
- int n_index = v.size() ;
- for( int j=0; j<n_index; j++){
- res[ v[j] ] = i+1 ;
- }
+ SEXP q = attribs;
+ while (! Rf_isNull(p)) {
+ SEXP tag = TAG(p);
+ if (std::find(black_list.begin(), black_list.end(), tag) == black_list.end()) {
+ Shield<SEXP> s(Rf_cons(CAR(p), R_NilValue));
+ SETCDR(q, s);
+ q = CDR(q);
+ SET_TAG(q, tag);
}
- return res ;
+ p = CDR(p);
+ }
+ return attribs;
}
diff --git a/src/hybrid.cpp b/src/hybrid.cpp
new file mode 100644
index 0000000..08ec6b6
--- /dev/null
+++ b/src/hybrid.cpp
@@ -0,0 +1,198 @@
+#include "pch.h"
+#include <dplyr/main.h>
+
+#include <tools/hash.h>
+
+#include <dplyr/Hybrid.h>
+#include <dplyr/HybridHandlerMap.h>
+
+#include <dplyr/Result/ILazySubsets.h>
+#include <dplyr/Result/Rank.h>
+#include <dplyr/Result/ConstantResult.h>
+
+using namespace Rcpp;
+using namespace dplyr;
+
+bool has_no_class(const RObject& arg) {
+ return RCPP_GET_CLASS(arg) == R_NilValue;
+}
+
+bool hybridable(RObject arg) {
+ if (Rf_inherits(arg, "Date") || Rf_inherits(arg, "POSIXct") || Rf_inherits(arg, "difftime")) return true;
+
+ if (arg.isObject() || arg.isS4()) return false;
+ int type = arg.sexp_type();
+ switch (type) {
+ case INTSXP:
+ case REALSXP:
+ case LGLSXP:
+ case STRSXP:
+ case CPLXSXP:
+ case RAWSXP:
+ return has_no_class(arg);
+ default:
+ break;
+ }
+ return false;
+}
+
+template <template <int> class Templ>
+Result* cumfun_prototype(SEXP call, const ILazySubsets& subsets, int nargs) {
+ if (nargs != 1) return 0;
+ RObject data(CADR(call));
+ if (TYPEOF(data) == SYMSXP) {
+ data = subsets.get_variable(SymbolString(Symbol(data)));
+ }
+ switch (TYPEOF(data)) {
+ case INTSXP:
+ return new Templ<INTSXP>(data);
+ case REALSXP:
+ return new Templ<REALSXP>(data);
+ default:
+ break;
+ }
+ return 0;
+}
+
+HybridHandlerMap& get_handlers() {
+ static HybridHandlerMap handlers;
+ if (!handlers.size()) {
+ /*
+ handlers[ Rf_install( "cumsum") ] = cumfun_prototype<CumSum>;
+ handlers[ Rf_install( "cummin") ] = cumfun_prototype<CumMin>;
+ handlers[ Rf_install( "cummax") ] = cumfun_prototype<CumMax>;
+ */
+
+ install_simple_handlers(handlers);
+ install_minmax_handlers(handlers);
+ install_count_handlers(handlers);
+ install_nth_handlers(handlers);
+ install_window_handlers(handlers);
+ install_offset_handlers(handlers);
+ install_in_handlers(handlers);
+ install_debug_handlers(handlers);
+ }
+ return handlers;
+}
+
+Result* constant_handler(SEXP constant) {
+ switch (TYPEOF(constant)) {
+ case INTSXP:
+ {
+ if (Rf_inherits(constant, "Date")) return new TypedConstantResult<INTSXP>(constant, get_date_classes());
+ return new ConstantResult<INTSXP>(constant);
+ }
+ case REALSXP:
+ {
+ if (Rf_inherits(constant, "difftime")) return new DifftimeConstantResult<REALSXP>(constant);
+ if (Rf_inherits(constant, "POSIXct")) return new TypedConstantResult<REALSXP>(constant, get_time_classes());
+ if (Rf_inherits(constant, "Date")) return new TypedConstantResult<REALSXP>(constant, get_date_classes());
+ return new ConstantResult<REALSXP>(constant);
+ }
+ case STRSXP:
+ return new ConstantResult<STRSXP>(constant);
+ case LGLSXP:
+ return new ConstantResult<LGLSXP>(constant);
+ case CPLXSXP:
+ return new ConstantResult<CPLXSXP>(constant);
+ default:
+ return 0;
+ }
+}
+
+class VariableResult : public Result {
+public:
+ VariableResult(const ILazySubsets& subsets_, const SymbolString& name_) : subsets(subsets_), name(name_) {}
+
+ SEXP process(const GroupedDataFrame&) {
+ if (subsets.is_summary(name)) {
+ // No need to check length since the summary has already been checked
+ return subsets.get_variable(name);
+ } else {
+ stop("VariableResult::process() needs a summary variable");
+ }
+ }
+
+ SEXP process(const RowwiseDataFrame&) {
+ return subsets.get_variable(name);
+ }
+
+ virtual SEXP process(const FullDataFrame&) {
+ return subsets.get_variable(name);
+ }
+
+ virtual SEXP process(const SlicingIndex& index) {
+ return subsets.get(name, index);
+ }
+
+private:
+ const ILazySubsets& subsets;
+ SymbolString name;
+};
+
+Result* variable_handler(const ILazySubsets& subsets, const SymbolString& variable) {
+ return new VariableResult(subsets, variable);
+}
+
+namespace dplyr {
+
+Result* get_handler(SEXP call, const ILazySubsets& subsets, const Environment& env) {
+ LOG_INFO << "Looking up hybrid handler for call of type " << type2name(call);
+
+ if (TYPEOF(call) == LANGSXP) {
+ int depth = Rf_length(call);
+ HybridHandlerMap& handlers = get_handlers();
+ SEXP fun_symbol = CAR(call);
+ if (TYPEOF(fun_symbol) != SYMSXP) {
+ LOG_VERBOSE << "Not a function: " << type2name(fun_symbol);
+ return 0;
+ }
+
+ LOG_VERBOSE << "Searching hybrid handler for function " << CHAR(PRINTNAME(fun_symbol));
+
+ HybridHandlerMap::const_iterator it = handlers.find(fun_symbol);
+ if (it == handlers.end()) {
+ LOG_VERBOSE << "Not found";
+ return 0;
+ }
+
+ LOG_INFO << "Using hybrid handler for " << CHAR(PRINTNAME(fun_symbol));
+
+ return it->second(call, subsets, depth - 1);
+ } else if (TYPEOF(call) == SYMSXP) {
+ SymbolString sym = SymbolString(Symbol(call));
+
+ LOG_VERBOSE << "Searching hybrid handler for symbol " << sym.get_utf8_cstring();
+
+ if (subsets.has_variable(sym)) {
+ if (!subsets.is_summary(sym)) return 0;
+
+ LOG_VERBOSE << "Using hybrid variable handler";
+ return variable_handler(subsets, sym);
+ }
+ else {
+ SEXP data;
+ try {
+ data = env.find(sym.get_string());
+ } catch (Rcpp::binding_not_found) {
+ return NULL;
+ }
+
+ // Constants of length != 1 are handled via regular evaluation
+ if (Rf_length(data) == 1) {
+ LOG_VERBOSE << "Using hybrid constant handler";
+ return constant_handler(data);
+ }
+ }
+ } else {
+ // TODO: perhaps deal with SYMSXP separately
+ if (Rf_length(call) == 1) return constant_handler(call);
+ }
+ return 0;
+}
+
+}
+
+void registerHybridHandler(const char* name, HybridHandler proto) {
+ get_handlers()[Rf_install(name)] = proto;
+}
diff --git a/src/hybrid_count.cpp b/src/hybrid_count.cpp
new file mode 100644
index 0000000..11a0a4c
--- /dev/null
+++ b/src/hybrid_count.cpp
@@ -0,0 +1,56 @@
+#include "pch.h"
+#include <dplyr/main.h>
+
+#include <dplyr/HybridHandlerMap.h>
+
+#include <dplyr/MultipleVectorVisitors.h>
+
+#include <dplyr/Result/ILazySubsets.h>
+
+#include <dplyr/Result/Count.h>
+#include <dplyr/Result/Count_Distinct.h>
+
+using namespace Rcpp;
+using namespace dplyr;
+
+Result* count_prototype(SEXP args, const ILazySubsets&, int) {
+ if (Rf_length(args) != 1)
+ stop("`n()` does not take arguments");
+ return new Count;
+}
+
+Result* count_distinct_prototype(SEXP call, const ILazySubsets& subsets, int) {
+ MultipleVectorVisitors visitors;
+ bool na_rm = false;
+
+ for (SEXP p = CDR(call); !Rf_isNull(p); p = CDR(p)) {
+ SEXP x = maybe_rhs(CAR(p));
+ if (!Rf_isNull(TAG(p)) && TAG(p) == Rf_install("na.rm")) {
+ if (TYPEOF(x) == LGLSXP && Rf_length(x) == 1) {
+ na_rm = LOGICAL(x)[0];
+ } else {
+ stop("incompatible value for `na.rm` argument");
+ }
+ } else if (TYPEOF(x) == SYMSXP) {
+ SymbolString name = SymbolString(Symbol(x));
+ visitors.push_back(subsets.get_variable(name));
+ } else {
+ return 0;
+ }
+ }
+
+ if (visitors.size() == 0) {
+ stop("Need at least one column for `n_distinct()`");
+ }
+
+ if (na_rm) {
+ return new Count_Distinct_Narm<MultipleVectorVisitors>(visitors);
+ } else {
+ return new Count_Distinct<MultipleVectorVisitors>(visitors);
+ }
+}
+
+void install_count_handlers(HybridHandlerMap& handlers) {
+ handlers[ Rf_install("n") ] = count_prototype;
+ handlers[ Rf_install("n_distinct") ] = count_distinct_prototype;
+}
diff --git a/src/hybrid_debug.cpp b/src/hybrid_debug.cpp
new file mode 100644
index 0000000..ac9eaa3
--- /dev/null
+++ b/src/hybrid_debug.cpp
@@ -0,0 +1,93 @@
+#include "pch.h"
+#include <dplyr/main.h>
+
+#include <dplyr/HybridHandlerMap.h>
+
+#include <dplyr/Result/ILazySubsets.h>
+
+#include <dplyr/Result/Result.h>
+
+using namespace Rcpp;
+using namespace dplyr;
+
+
+class VerifyHybrid : public Result {
+public:
+ explicit VerifyHybrid(SEXP x_) : x(x_) {}
+
+public:
+ SEXP process(const RowwiseDataFrame&) {
+ return x;
+ }
+
+ SEXP process(const GroupedDataFrame&) {
+ return x;
+ }
+
+ SEXP process(const FullDataFrame&) {
+ return x;
+ }
+
+ SEXP process(const SlicingIndex&) {
+ return x;
+ }
+
+private:
+ RObject x;
+};
+
+Result* verify_hybrid_prototype(SEXP call, const ILazySubsets&, int nargs) {
+ // if not exactly one arg, let R handle it
+ if (nargs != 1)
+ return 0;
+
+ // if it isn't a constant, let R handle it
+ SEXP arg = CADR(call);
+ if (TYPEOF(arg) == SYMSXP || TYPEOF(arg) == LANGSXP)
+ return 0;
+
+ return new VerifyHybrid(arg);
+}
+
+class VerifyNotHybrid : public Result {
+public:
+ explicit VerifyNotHybrid(SEXP x_) : x(x_) {}
+
+public:
+ SEXP process(const RowwiseDataFrame&) {
+ stop("In hybrid evaluation");
+ }
+
+ SEXP process(const GroupedDataFrame&) {
+ stop("In hybrid evaluation");
+ }
+
+ SEXP process(const FullDataFrame&) {
+ stop("In hybrid evaluation");
+ }
+
+ SEXP process(const SlicingIndex&) {
+ stop("In hybrid evaluation");
+ }
+
+private:
+ RObject x;
+};
+
+Result* verify_not_hybrid_prototype(SEXP call, const ILazySubsets&, int nargs) {
+ // if not exactly one arg, let R handle it
+ if (nargs != 1)
+ return 0;
+
+ // if it isn't a constant, let R handle it
+ SEXP arg = CADR(call);
+ if (TYPEOF(arg) == SYMSXP || TYPEOF(arg) == LANGSXP)
+ return 0;
+
+ return new VerifyNotHybrid(arg);
+}
+
+void install_debug_handlers(HybridHandlerMap& handlers) {
+ handlers[ Rf_install("verify_hybrid") ] = verify_hybrid_prototype;
+ handlers[ Rf_install("verify_not_hybrid") ] = verify_not_hybrid_prototype;
+}
diff --git a/src/hybrid_in.cpp b/src/hybrid_in.cpp
new file mode 100644
index 0000000..e7e473e
--- /dev/null
+++ b/src/hybrid_in.cpp
@@ -0,0 +1,52 @@
+#include "pch.h"
+#include <dplyr/main.h>
+
+#include <dplyr/HybridHandlerMap.h>
+
+#include <dplyr/Result/ILazySubsets.h>
+
+#include <dplyr/Result/In.h>
+
+using namespace Rcpp;
+using namespace dplyr;
+
+Result* in_prototype(SEXP call, const ILazySubsets& subsets, int) {
+ SEXP lhs = maybe_rhs(CADR(call));
+ SEXP rhs = maybe_rhs(CADDR(call));
+
+ // if lhs is not a symbol, let R handle it
+ if (TYPEOF(lhs) != SYMSXP) return 0;
+
+ SymbolString name = SymbolString(Symbol(lhs));
+
+ // if the lhs is not in the data, let R handle it
+ if (!subsets.has_variable(name)) return 0;
+
+ SEXP v = subsets.get_variable(name);
+
+ // if the type of the data is not the same as the type of rhs,
+ // including if it needs evaluation, let R handle it
+ if (TYPEOF(v) != TYPEOF(rhs)) return 0;
+
+ // otherwise use hybrid version
+ switch (TYPEOF(v)) {
+ case LGLSXP:
+ return new In<LGLSXP>(v, rhs);
+ case INTSXP:
+ return new In<INTSXP>(v, rhs);
+ case REALSXP:
+ return new In<REALSXP>(v, rhs);
+ case STRSXP:
+ return new In<STRSXP>(v, rhs);
+ default:
+ break;
+ }
+
+ // type not handled
+ return 0;
+
+}
+
+void install_in_handlers(HybridHandlerMap& handlers) {
+ handlers[ Rf_install("%in%") ] = in_prototype;
+}
diff --git a/src/hybrid_minmax.cpp b/src/hybrid_minmax.cpp
new file mode 100644
index 0000000..bf5cb17
--- /dev/null
+++ b/src/hybrid_minmax.cpp
@@ -0,0 +1,72 @@
+#include "pch.h"
+#include <dplyr/main.h>
+
+#include <dplyr/HybridHandlerMap.h>
+
+#include <dplyr/Result/ILazySubsets.h>
+
+#include <dplyr/Result/MinMax.h>
+
+using namespace Rcpp;
+using namespace dplyr;
+
+template<bool MINIMUM, bool NA_RM>
+Result* minmax_prototype_impl(SEXP arg, bool is_summary) {
+ arg = maybe_rhs(arg);
+ if (!hybridable(arg)) return 0;
+
+ switch (TYPEOF(arg)) {
+ case INTSXP:
+ return new MinMax<INTSXP, MINIMUM, NA_RM>(arg, is_summary);
+ case REALSXP:
+ return new MinMax<REALSXP, MINIMUM, NA_RM>(arg, is_summary);
+ default:
+ break;
+ }
+ return 0;
+}
+
+template<bool MINIMUM>
+Result* minmax_prototype(SEXP call, const ILazySubsets& subsets, int nargs) {
+ using namespace dplyr;
+ // we only can handle 1 or two arguments
+ if (nargs == 0 || nargs > 2) return 0;
+
+ // the first argument is the data to operate on
+ SEXP arg = maybe_rhs(CADR(call));
+
+ bool is_summary = false;
+ if (TYPEOF(arg) == SYMSXP) {
+ SymbolString name = SymbolString(Symbol(arg));
+ if (subsets.has_variable(name)) {
+ is_summary = subsets.is_summary(name);
+ arg = subsets.get_variable(name);
+ }
+ else return 0;
+ } else {
+ return 0;
+ }
+
+ if (nargs == 1) {
+ return minmax_prototype_impl<MINIMUM, false>(arg, is_summary);
+ } else if (nargs == 2) {
+ SEXP arg2 = CDDR(call);
+ // we know how to handle fun( ., na.rm = TRUE/FALSE )
+ if (TAG(arg2) == R_NaRmSymbol) {
+ SEXP narm = CAR(arg2);
+ if (TYPEOF(narm) == LGLSXP && LENGTH(narm) == 1) {
+ if (LOGICAL(narm)[0] == TRUE) {
+ return minmax_prototype_impl<MINIMUM, true>(arg, is_summary);
+ } else {
+ return minmax_prototype_impl<MINIMUM, false>(arg, is_summary);
+ }
+ }
+ }
+ }
+ return 0;
+}
+
+void install_minmax_handlers(HybridHandlerMap& handlers) {
+ handlers[Rf_install("min")] = minmax_prototype<true>;
+ handlers[Rf_install("max")] = minmax_prototype<false>;
+}
diff --git a/src/hybrid_nth.cpp b/src/hybrid_nth.cpp
new file mode 100644
index 0000000..5176e7a
--- /dev/null
+++ b/src/hybrid_nth.cpp
@@ -0,0 +1,322 @@
+#include "pch.h"
+#include <dplyr/main.h>
+
+#include <dplyr/Order.h>
+#include <dplyr/HybridHandlerMap.h>
+
+#include <dplyr/Result/Processor.h>
+#include <dplyr/Result/ILazySubsets.h>
+#include <dplyr/Result/VectorSliceVisitor.h>
+
+using namespace Rcpp;
+using namespace dplyr;
+
+namespace dplyr {
+
+bool argmatch(const std::string& target, const std::string& s) {
+ if (s.size() > target.size()) return false;
+ return target.compare(0, s.size(), s) == 0;
+}
+
+template <int RTYPE>
+class Nth : public Processor< RTYPE, Nth<RTYPE> > {
+public:
+ typedef Processor< RTYPE, Nth<RTYPE> > Base;
+ typedef typename Rcpp::traits::storage_type<RTYPE>::type STORAGE;
+
+ Nth(Vector<RTYPE> data_, int idx_, STORAGE def_ = Vector<RTYPE>::get_na()) :
+ Base(data_),
+ data(data_),
+ idx(idx_),
+ def(def_) {}
+
+ inline STORAGE process_chunk(const SlicingIndex& indices) {
+ int n = indices.size();
+ if (n == 0 || idx > n || idx < -n) return def;
+ int i = idx > 0 ? (idx - 1) : (n + idx);
+ return data[indices[i]];
+ }
+
+private:
+ Vector<RTYPE> data;
+ int idx;
+ STORAGE def;
+};
+
+template <int RTYPE, int ORDER_RTYPE>
+class NthWith : public Processor< RTYPE, NthWith<RTYPE, ORDER_RTYPE> > {
+public:
+ typedef Processor< RTYPE, NthWith<RTYPE, ORDER_RTYPE> > Base;
+ typedef typename Rcpp::traits::storage_type<RTYPE>::type STORAGE;
+
+ NthWith(Vector<RTYPE> data_, int idx_, Vector<ORDER_RTYPE> order_, STORAGE def_ = Vector<RTYPE>::get_na()) :
+ Base(data_),
+ data(data_),
+ idx(idx_),
+ order(order_),
+ def(def_) {}
+
+ inline STORAGE process_chunk(const SlicingIndex& indices) {
+ int n = indices.size();
+ if (n == 0 || idx > n || idx < -n) return def;
+
+ int i = idx > 0 ? (idx - 1) : (n + idx);
+
+ typedef VectorSliceVisitor<ORDER_RTYPE> Slice;
+ typedef OrderVectorVisitorImpl<ORDER_RTYPE, true, Slice> Visitor;
+ typedef Compare_Single_OrderVisitor<Visitor> Comparer;
+
+ // Need explicit variables because constructors take const&, and this does not work
+ // with unnamed temporaries.
+ Slice slice(&order, indices);
+ Visitor visitor(slice);
+ Comparer comparer(visitor);
+
+ IntegerVector sequence = seq(0, n - 1);
+ std::nth_element(sequence.begin(), sequence.begin() + i, sequence.end(), comparer);
+
+ return data[ indices[ sequence[i] ] ];
+ }
+
+private:
+ Vector<RTYPE> data;
+ int idx;
+ Vector<ORDER_RTYPE> order;
+ STORAGE def;
+};
+
+Result* nth_(SEXP data, int idx) {
+ switch (TYPEOF(data)) {
+ case LGLSXP:
+ return new Nth<LGLSXP>(data, idx);
+ case INTSXP:
+ return new Nth<INTSXP>(data, idx);
+ case REALSXP:
+ return new Nth<REALSXP>(data, idx);
+ case CPLXSXP:
+ return new Nth<CPLXSXP>(data, idx);
+ case STRSXP:
+ return new Nth<STRSXP>(data, idx);
+ default:
+ return 0;
+ }
+}
+
+template <int RTYPE>
+Result* nth_noorder_default(Vector<RTYPE> data, int idx, Vector<RTYPE> def) {
+ return new Nth<RTYPE>(data, idx, def[0]);
+}
+
+Result* nth_noorder_default_(SEXP data, int idx, SEXP def) {
+ switch (TYPEOF(data)) {
+ case LGLSXP:
+ return nth_noorder_default<LGLSXP>(data, idx, def);
+ case INTSXP:
+ return nth_noorder_default<INTSXP>(data, idx, def);
+ case REALSXP:
+ return nth_noorder_default<REALSXP>(data, idx, def);
+ case CPLXSXP:
+ return nth_noorder_default<CPLXSXP>(data, idx, def);
+ case STRSXP:
+ return nth_noorder_default<STRSXP>(data, idx, def);
+ default:
+ return 0;
+ }
+}
+
+template <int RTYPE>
+Result* nth_with(Vector<RTYPE> data, int idx, SEXP order) {
+ switch (TYPEOF(order)) {
+ case LGLSXP:
+ return new NthWith<RTYPE, LGLSXP>(data, idx, order);
+ case INTSXP:
+ return new NthWith<RTYPE, INTSXP>(data, idx, order);
+ case REALSXP:
+ return new NthWith<RTYPE, REALSXP>(data, idx, order);
+ case CPLXSXP:
+ return new NthWith<RTYPE, CPLXSXP>(data, idx, order);
+ case STRSXP:
+ return new NthWith<RTYPE, STRSXP>(data, idx, order);
+ default:
+ break;
+ }
+ bad_arg(SymbolString("order"), "is of unsupported type %s", Rf_type2char(TYPEOF(order)));
+}
+
+Result* nth_with_(SEXP data, int idx, SEXP order_by) {
+ switch (TYPEOF(data)) {
+ case LGLSXP:
+ return nth_with<LGLSXP>(data, idx, order_by);
+ case INTSXP:
+ return nth_with<INTSXP>(data, idx, order_by);
+ case REALSXP:
+ return nth_with<REALSXP>(data, idx, order_by);
+ case CPLXSXP:
+ return nth_with<CPLXSXP>(data, idx, order_by);
+ case STRSXP:
+ return nth_with<STRSXP>(data, idx, order_by);
+ default:
+ return 0;
+ }
+}
+
+template <int RTYPE>
+Result* nth_with_default(Vector<RTYPE> data, int idx, SEXP order, Vector<RTYPE> def) {
+ switch (TYPEOF(order)) {
+ case LGLSXP:
+ return new NthWith<RTYPE, LGLSXP>(data, idx, order, def[0]);
+ case INTSXP:
+ return new NthWith<RTYPE, INTSXP>(data, idx, order, def[0]);
+ case REALSXP:
+ return new NthWith<RTYPE, REALSXP>(data, idx, order, def[0]);
+ case CPLXSXP:
+ return new NthWith<RTYPE, CPLXSXP>(data, idx, order, def[0]);
+ case STRSXP:
+ return new NthWith<RTYPE, STRSXP>(data, idx, order, def[0]);
+ default:
+ break;
+ }
+ bad_arg(SymbolString("order"), "is of unsupported type %s", Rf_type2char(TYPEOF(order)));
+}
+
+Result* nth_with_default_(SEXP data, int idx, SEXP order_by, SEXP def) {
+ switch (TYPEOF(data)) {
+ case LGLSXP:
+ return nth_with_default<LGLSXP>(data, idx, order_by, def);
+ case INTSXP:
+ return nth_with_default<INTSXP>(data, idx, order_by, def);
+ case REALSXP:
+ return nth_with_default<REALSXP>(data, idx, order_by, def);
+ case CPLXSXP:
+ return nth_with_default<CPLXSXP>(data, idx, order_by, def);
+ case STRSXP:
+ return nth_with_default<STRSXP>(data, idx, order_by, def);
+ default:
+ return 0;
+ }
+}
+
+Result* nth_prototype(SEXP call, const ILazySubsets& subsets, int nargs) {
+ // has to have at least two arguments
+ if (nargs < 2) return 0;
+
+ SEXP tag = TAG(CDR(call));
+ if (tag != R_NilValue && tag != Rf_install("x")) {
+ return 0;
+ }
+ SEXP data = maybe_rhs(CADR(call));
+ if (TYPEOF(data) != SYMSXP)
+ return 0;
+
+ SymbolString name = SymbolString(Symbol(data));
+ if (subsets.has_non_summary_variable(name) == 0) {
+ return 0;
+ }
+ data = subsets.get_variable(name);
+
+ tag = TAG(CDDR(call));
+ if (tag != R_NilValue && tag != Rf_install("n")) {
+ return 0;
+ }
+ SEXP nidx = CADDR(call);
+ if ((TYPEOF(nidx) != REALSXP && TYPEOF(nidx) != INTSXP) || LENGTH(nidx) != 1) {
+ // we only know how to handle the case where nidx is a length one
+ // integer or numeric. In any other case, e.g. an expression for R to evaluate
+ // we just fallback to R evaluation (#734)
+ return 0;
+ }
+ int idx = as<int>(nidx);
+
+ // easy case : just a single variable: first(x,n)
+ if (nargs == 2) {
+ return nth_(data, idx);
+ }
+
+ // now get `order_by` and `default`
+ SEXP order_by = R_NilValue;
+ SEXP def = R_NilValue;
+ bool has_order_by = false;
+ bool has_default = false;
+
+ SEXP p = CDR(CDDR(call));
+ while (p != R_NilValue) {
+ SEXP tag = TAG(p);
+ if (!has_order_by && (Rf_isNull(tag) || argmatch("order_by", CHAR(PRINTNAME(tag))))) {
+ order_by = CAR(p);
+ has_order_by = true;
+ }
+ else if (!has_default && (Rf_isNull(tag) || argmatch("default", CHAR(PRINTNAME(tag))))) {
+ def = CAR(p);
+ if (TYPEOF(def) == SYMSXP || TYPEOF(def) == LANGSXP) return 0;
+ has_default = true;
+ }
+ else {
+ return 0;
+ }
+
+ p = CDR(p);
+ }
+
+ // handle cases
+ if (Rf_isNull(def)) {
+ // then we know order_by is not NULL, we only handle the case where
+ // order_by is a symbol and that symbol is in the data
+ if (TYPEOF(order_by) != SYMSXP)
+ return 0;
+
+ SymbolString order_by_name = SymbolString(Symbol(order_by));
+ if (subsets.has_non_summary_variable(order_by_name) == 0)
+ return 0;
+
+ order_by = subsets.get_variable(order_by_name);
+
+ return nth_with_(data, idx, order_by);
+ }
+
+ if (Rf_isNull(order_by)) {
+ return nth_noorder_default_(data, idx, def);
+ }
+
+ if (TYPEOF(order_by) != SYMSXP)
+ return 0;
+
+ SymbolString order_by_name = SymbolString(Symbol(order_by));
+ if (subsets.has_non_summary_variable(order_by_name) == 0)
+ return 0;
+
+ order_by = subsets.get_variable(order_by_name);
+
+ return nth_with_default_(data, idx, order_by, def);
+}
+
+Result* firstlast_prototype(SEXP call, const ILazySubsets& subsets, int nargs, int pos) {
+ SEXP tail = CDDR(call);
+
+ SETCAR(call, Rf_install("nth"));
+
+ Pairlist p(pos);
+ if (Rf_isNull(tail)) {
+ SETCDR(CDR(call), p);
+ } else {
+ SETCDR(p, tail);
+ SETCDR(CDR(call), p);
+ }
+ Result* res = nth_prototype(call, subsets, nargs + 1);
+ return res;
+}
+
+Result* first_prototype(SEXP call, const ILazySubsets& subsets, int nargs) {
+ return firstlast_prototype(call, subsets, nargs, 1);
+}
+
+Result* last_prototype(SEXP call, const ILazySubsets& subsets, int nargs) {
+ return firstlast_prototype(call, subsets, nargs, -1);
+}
+
+}
+
+void install_nth_handlers(HybridHandlerMap& handlers) {
+ handlers[ Rf_install("first") ] = first_prototype;
+ handlers[ Rf_install("last") ] = last_prototype;
+ handlers[ Rf_install("nth") ] = nth_prototype;
+}
diff --git a/src/hybrid_offset.cpp b/src/hybrid_offset.cpp
new file mode 100644
index 0000000..6c6ade5
--- /dev/null
+++ b/src/hybrid_offset.cpp
@@ -0,0 +1,98 @@
+#include "pch.h"
+#include <dplyr/main.h>
+
+#include <dplyr/HybridHandlerMap.h>
+
+#include <dplyr/Result/ILazySubsets.h>
+
+#include <dplyr/Result/Lead.h>
+#include <dplyr/Result/Lag.h>
+
+using namespace Rcpp;
+using namespace dplyr;
+
+struct LeadLag {
+
+ explicit LeadLag(SEXP call) : data(R_NilValue), n(1), def(R_NilValue), ok(false) {
+
+ SEXP p = CDR(call);
+ SEXP tag = TAG(p);
+ if (tag != R_NilValue && tag != Rf_install("x"))
+ return;
+ data = maybe_rhs(CAR(p));
+ p = CDR(p);
+
+ SEXP tag_default = Rf_install("default");
+ SEXP tag_n = Rf_install("n");
+ bool got_n = false;
+ bool got_default = false;
+
+ while (!Rf_isNull(p)) {
+ tag = TAG(p);
+ if (!Rf_isNull(tag) && tag != tag_n && tag != tag_default)
+ return;
+ if (!got_n && (Rf_isNull(tag) || tag == tag_n)) {
+ SEXP n_ = CAR(p);
+ if (TYPEOF(n_) != INTSXP && TYPEOF(n_) != REALSXP)
+ return;
+ n = as<int>(n_);
+ got_n = true;
+ }
+ else if (!got_default && (Rf_isNull(tag) || tag == tag_default)) {
+ def = CAR(p);
+ if (TYPEOF(def) == LANGSXP) return;
+ got_default = true;
+ }
+ else
+ return;
+ p = CDR(p);
+ }
+
+ ok = true;
+ }
+
+ RObject data;
+ int n;
+ RObject def;
+
+ bool ok;
+
+};
+
+template < template<int> class Templ>
+Result* leadlag_prototype(SEXP call, const ILazySubsets& subsets, int) {
+ LeadLag args(call);
+ if (!args.ok) return 0;
+ RObject& data = args.data;
+
+ if (TYPEOF(data) != SYMSXP)
+ return 0;
+
+ SymbolString name = SymbolString(Symbol(data));
+ if (subsets.has_variable(name) == 0)
+ return 0;
+
+ bool is_summary = subsets.is_summary(name);
+ int n = args.n;
+ data = subsets.get_variable(name);
+
+ switch (TYPEOF(data)) {
+ case INTSXP:
+ return new Templ<INTSXP>(data, n, args.def, is_summary);
+ case REALSXP:
+ return new Templ<REALSXP>(data, n, args.def, is_summary);
+ case CPLXSXP:
+ return new Templ<CPLXSXP>(data, n, args.def, is_summary);
+ case STRSXP:
+ return new Templ<STRSXP>(data, n, args.def, is_summary);
+ case LGLSXP:
+ return new Templ<LGLSXP>(data, n, args.def, is_summary);
+ default:
+ return 0;
+ }
+}
+
+void install_offset_handlers(HybridHandlerMap& handlers) {
+ handlers[ Rf_install("lead") ] = leadlag_prototype<Lead>;
+ handlers[ Rf_install("lag") ] = leadlag_prototype<Lag>;
+}
diff --git a/src/hybrid_simple.cpp b/src/hybrid_simple.cpp
new file mode 100644
index 0000000..6c7ef14
--- /dev/null
+++ b/src/hybrid_simple.cpp
@@ -0,0 +1,78 @@
+#include "pch.h"
+#include <dplyr/main.h>
+
+#include <dplyr/HybridHandlerMap.h>
+
+#include <dplyr/Result/ILazySubsets.h>
+
+#include <dplyr/Result/Mean.h>
+#include <dplyr/Result/Sum.h>
+#include <dplyr/Result/Var.h>
+#include <dplyr/Result/Sd.h>
+
+using namespace Rcpp;
+using namespace dplyr;
+
+template <template <int, bool> class Fun, bool narm>
+Result* simple_prototype_impl(SEXP arg, bool is_summary) {
+ // if not hybridable, just let R handle it
+ if (!hybridable(arg)) return 0;
+
+ switch (TYPEOF(arg)) {
+ case INTSXP:
+ return new Fun<INTSXP, narm>(arg, is_summary);
+ case REALSXP:
+ return new Fun<REALSXP, narm>(arg, is_summary);
+ default:
+ break;
+ }
+ return 0;
+}
+
+template <template <int, bool> class Fun>
+Result* simple_prototype(SEXP call, const ILazySubsets& subsets, int nargs) {
+ if (nargs == 0) return 0;
+ SEXP arg = maybe_rhs(CADR(call));
+ bool is_summary = false;
+ if (TYPEOF(arg) == SYMSXP) {
+ SymbolString name = SymbolString(Symbol(arg));
+ if (subsets.has_variable(name)) {
+ // we have a symbol from the data - great
+ is_summary = subsets.is_summary(name);
+ arg = subsets.get_variable(name);
+ } else {
+ // we have a symbol but we don't know about it, so we give up and let R evaluation handle it
+ return 0;
+ }
+ } else {
+ // anything else: expressions, constants ...
+ // workaround for now : we just let R deal with it
+ // of course this needs some specializations, i.e. sum(1) does not need R to get involved
+ return 0;
+ }
+
+ if (nargs == 1) {
+ return simple_prototype_impl<Fun, false>(arg, is_summary);
+ } else if (nargs == 2) {
+ SEXP arg2 = CDDR(call);
+ // we know how to handle fun( ., na.rm = TRUE/FALSE )
+ if (TAG(arg2) == R_NaRmSymbol) {
+ SEXP narm = CAR(arg2);
+ if (TYPEOF(narm) == LGLSXP && LENGTH(narm) == 1) {
+ if (LOGICAL(narm)[0] == TRUE) {
+ return simple_prototype_impl<Fun, true>(arg, is_summary);
+ } else {
+ return simple_prototype_impl<Fun, false>(arg, is_summary);
+ }
+ }
+ }
+ }
+ return 0;
+}
+
+void install_simple_handlers(HybridHandlerMap& handlers) {
+ handlers[ Rf_install("mean") ] = simple_prototype<dplyr::Mean>;
+ handlers[ Rf_install("var") ] = simple_prototype<dplyr::Var>;
+ handlers[ Rf_install("sd") ] = simple_prototype<dplyr::Sd>;
+ handlers[ Rf_install("sum") ] = simple_prototype<dplyr::Sum>;
+}
diff --git a/src/hybrid_window.cpp b/src/hybrid_window.cpp
new file mode 100644
index 0000000..768b1e3
--- /dev/null
+++ b/src/hybrid_window.cpp
@@ -0,0 +1,166 @@
+#include "pch.h"
+#include <dplyr/main.h>
+
+#include <dplyr/HybridHandlerMap.h>
+
+#include <dplyr/Result/ILazySubsets.h>
+#include <dplyr/Result/Rank.h>
+
+using namespace Rcpp;
+using namespace dplyr;
+
+namespace dplyr {
+
+template <bool ascending>
+Result* row_number_asc(const RObject& data) {
+ switch (TYPEOF(data)) {
+ case INTSXP:
+ return new RowNumber<INTSXP, ascending>(data);
+ case REALSXP:
+ return new RowNumber<REALSXP, ascending>(data);
+ case STRSXP:
+ return new RowNumber<STRSXP, ascending>(data);
+ default:
+ return 0;
+ }
+}
+
+Result* row_number(const RObject& data, const bool ascending) {
+ if (ascending) {
+ return row_number_asc<true>(data);
+ }
+ else {
+ return row_number_asc<false>(data);
+ }
+}
+
+Result* row_number_prototype(SEXP call, const ILazySubsets& subsets, int nargs) {
+ if (nargs > 1 || subsets.size() == 0) return 0;
+
+ if (nargs == 0) return new RowNumber_0();
+
+ RObject data(CADR(call));
+ bool ascending = true;
+ if (TYPEOF(data) == LANGSXP && CAR(data) == Rf_install("desc")) {
+ data = CADR(data);
+ ascending = false;
+ }
+
+ if (TYPEOF(data) == SYMSXP) {
+ SymbolString name = SymbolString(Symbol(data));
+ if (subsets.has_non_summary_variable(name)) data = subsets.get_variable(name);
+ else return 0;
+ }
+
+ if (subsets.nrows() != Rf_length(data)) return 0;
+
+ return row_number(data, ascending);
+}
+
+template <bool ascending>
+Result* ntile_asc(const RObject& data, const int number_tiles) {
+ switch (TYPEOF(data)) {
+ case INTSXP:
+ return new Ntile<INTSXP, ascending>(data, number_tiles);
+ case REALSXP:
+ return new Ntile<REALSXP, ascending>(data, number_tiles);
+ case STRSXP:
+ return new Ntile<STRSXP, ascending>(data, number_tiles);
+ default:
+ return 0;
+ }
+}
+
+Result* ntile(const RObject& data, const int number_tiles, const bool ascending) {
+ if (ascending) {
+ return ntile_asc<true>(data, number_tiles);
+ }
+ else {
+ return ntile_asc<false>(data, number_tiles);
+ }
+}
+
+Result* ntile_prototype(SEXP call, const ILazySubsets& subsets, int nargs) {
+ if (nargs != 2) return 0;
+
+ // handle 2nd arg
+ SEXP ntiles = maybe_rhs(CADDR(call));
+ if (TYPEOF(ntiles) != INTSXP && TYPEOF(ntiles) != REALSXP) return 0;
+ int number_tiles = as<int>(ntiles);
+ if (number_tiles == NA_INTEGER) return 0;
+
+ RObject data(maybe_rhs(CADR(call)));
+ bool ascending = true;
+ if (TYPEOF(data) == LANGSXP && CAR(data) == Rf_install("desc")) {
+ data = CADR(data);
+ ascending = false;
+ }
+
+ if (TYPEOF(data) == SYMSXP) {
+ SymbolString name = SymbolString(Symbol(data));
+ if (subsets.has_non_summary_variable(name)) data = subsets.get_variable(name);
+ else return 0;
+ }
+
+ if (subsets.nrows() != Rf_length(data)) return 0;
+
+ return ntile(data, number_tiles, ascending);
+}
+
+template <typename Increment, bool ascending>
+Result* rank_asc(const RObject& data) {
+ switch (TYPEOF(data)) {
+ case INTSXP:
+ return new Rank_Impl<INTSXP, Increment, ascending>(data);
+ case REALSXP:
+ return new Rank_Impl<REALSXP, Increment, ascending>(data);
+ case STRSXP:
+ return new Rank_Impl<STRSXP, Increment, ascending>(data);
+ default:
+ return 0;
+ }
+}
+
+template <typename Increment>
+Result* rank(const RObject& data, bool ascending) {
+ if (ascending) {
+ return rank_asc<Increment, true>(data);
+ }
+ else {
+ return rank_asc<Increment, false>(data);
+ }
+}
+
+template <typename Increment>
+Result* rank_impl_prototype(SEXP call, const ILazySubsets& subsets, int nargs) {
+ if (nargs != 1) return 0;
+
+ RObject data(maybe_rhs(CADR(call)));
+ bool ascending = true;
+
+ if (TYPEOF(data) == LANGSXP && CAR(data) == Rf_install("desc")) {
+ data = maybe_rhs(CADR(data));
+ ascending = false;
+ }
+
+ if (TYPEOF(data) == SYMSXP) {
+ SymbolString name = SymbolString(Symbol(data));
+ if (subsets.has_non_summary_variable(name)) data = subsets.get_variable(name);
+ else return 0;
+ }
+
+ if (subsets.nrows() != Rf_length(data)) return 0;
+
+ return rank<Increment>(data, ascending);
+}
+
+}
+
+void install_window_handlers(HybridHandlerMap& handlers) {
+ handlers[ Rf_install("row_number") ] = row_number_prototype;
+ handlers[ Rf_install("ntile") ] = ntile_prototype;
+ handlers[ Rf_install("min_rank") ] = rank_impl_prototype<dplyr::internal::min_rank_increment>;
+ handlers[ Rf_install("percent_rank") ] = rank_impl_prototype<dplyr::internal::percent_rank_increment>;
+ handlers[ Rf_install("dense_rank") ] = rank_impl_prototype<dplyr::internal::dense_rank_increment>;
+ handlers[ Rf_install("cume_dist") ] = rank_impl_prototype<dplyr::internal::cume_dist_increment>;
+}
diff --git a/src/init.cpp b/src/init.cpp
index fd96b81..cf29032 100644
--- a/src/init.cpp
+++ b/src/init.cpp
@@ -1,38 +1,40 @@
-#include <Rcpp.h>
-#include <dplyr.h>
+#include "pch.h"
+#include <dplyr/main.h>
-using namespace Rcpp ;
-using namespace dplyr ;
+#include <dplyr/registration.h>
+#include <dplyr/HybridHandler.h>
-#define DPLYR_REGISTER(__FUN__) R_RegisterCCallable( "dplyr", #__FUN__, (DL_FUNC)__FUN__ );
+using namespace Rcpp;
+using namespace dplyr;
-SEXP get_cache(){
- static SEXP cache = 0;
- if( !cache ){
- SEXP vec = PROTECT(Rf_allocVector(VECSXP, 2)) ;
- SEXP date_classes = PROTECT(Rf_mkString("Date")) ;
- SET_VECTOR_ELT( vec, 0, date_classes) ;
- CharacterVector time_classes = CharacterVector::create( "POSIXct", "POSIXt" ) ;
- SET_VECTOR_ELT( vec, 1, time_classes) ;
- UNPROTECT(2) ;
- R_PreserveObject(vec) ;
- cache = vec ;
- }
- return cache ;
+SEXP get_cache() {
+ static SEXP cache = 0;
+ if (!cache) {
+ SEXP vec = PROTECT(Rf_allocVector(VECSXP, 2));
+ SEXP date_classes = PROTECT(Rf_mkString("Date"));
+ SET_VECTOR_ELT(vec, 0, date_classes);
+ CharacterVector time_classes = CharacterVector::create("POSIXct", "POSIXt");
+ SET_VECTOR_ELT(vec, 1, time_classes);
+ UNPROTECT(2);
+ R_PreserveObject(vec);
+ cache = vec;
+ }
+ return cache;
}
-SEXP get_date_classes(){
- return VECTOR_ELT(get_cache(), 0) ;
+// [[Rcpp::interfaces(cpp)]]
+// [[Rcpp::export]]
+SEXP get_date_classes() {
+ return VECTOR_ELT(get_cache(), 0);
}
-SEXP get_time_classes(){
- return VECTOR_ELT(get_cache(), 1) ;
-}
-
-extern "C" void R_init_dplyr( DllInfo* info ){
- DPLYR_REGISTER(build_index_cpp)
- DPLYR_REGISTER(registerHybridHandler)
- DPLYR_REGISTER(get_time_classes)
- DPLYR_REGISTER(get_date_classes)
+// [[Rcpp::interfaces(cpp)]]
+// [[Rcpp::export]]
+SEXP get_time_classes() {
+ return VECTOR_ELT(get_cache(), 1);
}
+// work around a problem (?) in Rcpp
+// [[Rcpp::interfaces(cpp)]]
+// [[Rcpp::export]]
+DataFrame build_index_cpp(DataFrame data);
diff --git a/src/join.cpp b/src/join.cpp
index eda6b37..b3e0d03 100644
--- a/src/join.cpp
+++ b/src/join.cpp
@@ -1,426 +1,267 @@
-#include <dplyr.h>
-using namespace dplyr ;
-using namespace Rcpp ;
+#include "pch.h"
+#include <dplyr/main.h>
-namespace dplyr{
+#include <tools/encoding.h>
+#include <tools/SymbolString.h>
- bool same_levels( SEXP left, SEXP right ){
- SEXP s_levels = Rf_install("levels") ;
- CharacterVector levels_left = Rf_getAttrib(left,s_levels) ;
- CharacterVector levels_right = Rf_getAttrib(right,s_levels) ;
- if( (SEXP)levels_left == (SEXP)levels_right ) return true ;
- int n = levels_left.size() ;
- if( n != levels_right.size() ) return false ;
+#include <dplyr/JoinVisitorImpl.h>
- for( int i=0; i<n; i++) {
- if( levels_right[i] != levels_left[i] ) return false ;
- }
-
- return true ;
- }
-
- inline bool is_bare_vector( SEXP x){
- SEXP att = ATTRIB(x) ;
-
- // only allow R_Names. as in R's do_isvector
- while( att != R_NilValue ){
- SEXP tag = TAG(att) ;
- if( !( tag == R_NamesSymbol || tag == Rf_install("comment") ) ) return false ;
- att = CDR(att) ;
- }
-
- return true ;
- }
+namespace dplyr {
+inline bool is_bare_vector(SEXP x) {
+ SEXP att = ATTRIB(x);
- // -------------- (int,lgl)
- template <int LHS_RTYPE, int RHS_RTYPE>
- inline size_t hash_int_int( JoinVisitorImpl<LHS_RTYPE,RHS_RTYPE>& joiner, int i){
- return joiner.RHS_hash_fun( i>=0 ? joiner.left[i] : joiner.right[-i-1] ) ;
- }
- template <>
- inline size_t JoinVisitorImpl<INTSXP,LGLSXP>::hash( int i){
- return hash_int_int<INTSXP,LGLSXP>( *this, i) ;
- }
- template <>
- inline size_t JoinVisitorImpl<LGLSXP,INTSXP>::hash( int i){
- return hash_int_int<LGLSXP,INTSXP>( *this, i) ;
- }
- template <int LHS_RTYPE, int RHS_RTYPE>
- inline SEXP subset_join_int_int( JoinVisitorImpl<LHS_RTYPE,RHS_RTYPE>& joiner, const std::vector<int>& indices ){
- int n = indices.size() ;
- IntegerVector res = no_init(n) ;
- for( int i=0; i<n; i++) {
- int index = indices[i] ;
- if( index >= 0 ){
- res[i] = joiner.left[index] ;
- } else {
- res[i] = joiner.right[-index-1] ;
- }
- }
- return res ;
- }
- template <>
- inline SEXP JoinVisitorImpl<INTSXP,LGLSXP>::subset( const std::vector<int>& indices ){
- return subset_join_int_int<INTSXP,LGLSXP>( *this, indices ) ;
- }
- template <>
- inline SEXP JoinVisitorImpl<LGLSXP,INTSXP>::subset( const std::vector<int>& indices ){
- return subset_join_int_int<LGLSXP,INTSXP>( *this, indices ) ;
- }
-
- template <int LHS_RTYPE, int RHS_RTYPE>
- inline SEXP subset_join_int_int( JoinVisitorImpl<LHS_RTYPE,RHS_RTYPE>& joiner, const VisitorSetIndexSet<DataFrameJoinVisitors>& set ){
- int n = set.size() ;
- IntegerVector res = no_init(n) ;
- VisitorSetIndexSet<DataFrameJoinVisitors>::const_iterator it=set.begin() ;
- for( int i=0; i<n; i++, ++it) {
- int index = *it ;
- if( index >= 0 ){
- res[i] = joiner.left[index] ;
- } else {
- res[i] = joiner.right[-index-1] ;
- }
- }
- return res ;
- }
- template <>
- inline SEXP JoinVisitorImpl<INTSXP,LGLSXP>::subset( const VisitorSetIndexSet<DataFrameJoinVisitors>& set ){
- return subset_join_int_int<INTSXP,LGLSXP>( *this, set ) ;
- }
- template <>
- inline SEXP JoinVisitorImpl<LGLSXP,INTSXP>::subset( const VisitorSetIndexSet<DataFrameJoinVisitors>& set ){
- return subset_join_int_int<LGLSXP,INTSXP>( *this, set ) ;
- }
+ // only allow R_Names. as in R's do_isvector
+ while (att != R_NilValue) {
+ SEXP tag = TAG(att);
+ if (!(tag == R_NamesSymbol || tag == Rf_install("comment"))) return false;
+ att = CDR(att);
+ }
+ return true;
+}
- // -------------- (int,double)
- template <int RTYPE>
- inline size_t hash_int_double( JoinVisitorImpl<RTYPE,REALSXP>& joiner, int i ){
- if( i>=0 ){
- int val = joiner.left[i] ;
- if( val == NA_INTEGER ) return joiner.RHS_hash_fun( NA_REAL );
- return joiner.RHS_hash_fun( (double)val );
- }
- return joiner.RHS_hash_fun( joiner.right[-i-1] ) ;
- }
- template <>
- inline size_t JoinVisitorImpl<INTSXP,REALSXP>::hash(int i){
- return hash_int_double<INTSXP>( *this, i );
- }
- template <>
- inline size_t JoinVisitorImpl<LGLSXP,REALSXP>::hash(int i){
- return hash_int_double<LGLSXP>( *this, i );
- }
-
-
- template <int RTYPE>
- inline SEXP subset_join_int_double( JoinVisitorImpl<RTYPE,REALSXP>& joiner, const std::vector<int>& indices ){
- int n = indices.size() ;
- NumericVector res = no_init(n) ;
- for( int i=0; i<n; i++) {
- int index = indices[i] ;
- if( index >= 0 ){
- res[i] = Rcpp::internal::r_coerce<INTSXP,REALSXP>( joiner.left[index] ) ;
- } else {
- res[i] = joiner.right[-index-1] ;
- }
- }
- return res ;
- }
- template <>
- inline SEXP JoinVisitorImpl<INTSXP,REALSXP>::subset( const std::vector<int>& indices ){
- return subset_join_int_double<INTSXP>( *this, indices ) ;
- }
- template <>
- inline SEXP JoinVisitorImpl<LGLSXP,REALSXP>::subset( const std::vector<int>& indices ){
- return subset_join_int_double<LGLSXP>( *this, indices ) ;
- }
-
+void warn_bad_var(const SymbolString& var_left, const SymbolString& var_right,
+ std::string message, bool warn = true) {
+ if (!warn)
+ return;
+
+ if (var_left == var_right) {
+ std::string var_utf8 = var_left.get_utf8_cstring();
+ Rf_warningcall(
+ R_NilValue,
+ "Column `%s` %s",
+ var_utf8.c_str(),
+ message.c_str()
+ );
+ } else {
+ std::string left_utf8 = var_left.get_utf8_cstring();
+ std::string right_utf8 = var_right.get_utf8_cstring();
+ Rf_warningcall(
+ R_NilValue,
+ "Column `%s`/`%s` %s",
+ left_utf8.c_str(),
+ right_utf8.c_str(),
+ message.c_str()
+ );
+ }
- template <int RTYPE>
- inline SEXP subset_join_int_double( JoinVisitorImpl<RTYPE,REALSXP>& joiner, const VisitorSetIndexSet<DataFrameJoinVisitors>& set ){
- int n = set.size() ;
- NumericVector res = no_init(n) ;
- VisitorSetIndexSet<DataFrameJoinVisitors>::const_iterator it=set.begin() ;
- for( int i=0; i<n; i++, ++it) {
- int index = *it ;
- if( index >= 0){
- res[i] = Rcpp::internal::r_coerce<INTSXP,REALSXP>( joiner.left[index] ) ;
- } else {
- res[i] = joiner.right[-index-1] ;
- }
- }
- return res ;
- }
- template <>
- inline SEXP JoinVisitorImpl<INTSXP,REALSXP>::subset( const VisitorSetIndexSet<DataFrameJoinVisitors>& set ){
- return subset_join_int_double<INTSXP>(*this, set );
- }
- template <>
- inline SEXP JoinVisitorImpl<LGLSXP,REALSXP>::subset( const VisitorSetIndexSet<DataFrameJoinVisitors>& set ){
- return subset_join_int_double<LGLSXP>(*this, set );
- }
-
- // -------------- (double,int)
- template <int RTYPE>
- inline size_t hash_double_int( JoinVisitorImpl<REALSXP,RTYPE>& joiner, int i ){
- // if(i < 0) we need to take data in right
- if( i<0 ){
- int val = joiner.right[-i-1] ;
- if( val == NA_INTEGER ) return joiner.LHS_hash_fun( NA_REAL );
- return joiner.LHS_hash_fun( (double)val );
- }
- // otherwise take data in left
- return joiner.LHS_hash_fun( joiner.left[i] ) ;
- }
- template <>
- inline size_t JoinVisitorImpl<REALSXP,INTSXP>::hash(int i){
- size_t res = hash_double_int<INTSXP>( *this, i );
- return res ;
- }
- template <>
- inline size_t JoinVisitorImpl<REALSXP,LGLSXP>::hash(int i){
- return hash_double_int<LGLSXP>( *this, i );
- }
+}
+void check_attribute_compatibility(const Column& left, const Column& right) {
+ // Rely on R function based on all.equal
+ static Function attr_equal = Function("attr_equal", Environment::namespace_env("dplyr"));
+ bool ok = as<bool>(attr_equal(left.get_data(), right.get_data()));
+ if (!ok) {
+ warn_bad_var(left.get_name(), right.get_name(), "has different attributes on LHS and RHS of join");
+ }
+}
- template <int RTYPE>
- inline SEXP subset_join_double_int( JoinVisitorImpl<REALSXP,RTYPE>& joiner, const std::vector<int>& indices ){
- int n = indices.size() ;
- NumericVector res = no_init(n) ;
- for( int i=0; i<n; i++) {
- int index = indices[i] ;
- if( index < 0 ){
- res[i] = Rcpp::internal::r_coerce<INTSXP,REALSXP>( joiner.right[-index-1] ) ;
- } else {
- res[i] = joiner.left[index] ;
- }
- }
- return res ;
- }
- template <>
- inline SEXP JoinVisitorImpl<REALSXP,INTSXP>::subset( const std::vector<int>& indices ){
- return subset_join_double_int<INTSXP>( *this, indices ) ;
- }
- template <>
- inline SEXP JoinVisitorImpl<REALSXP,LGLSXP>::subset( const std::vector<int>& indices ){
- return subset_join_double_int<LGLSXP>( *this, indices ) ;
- }
+template <int LHS_RTYPE, bool ACCEPT_NA_MATCH>
+JoinVisitor* date_join_visitor_right(const Column& left, const Column& right) {
+ switch (TYPEOF(right.get_data())) {
+ case INTSXP:
+ return new DateJoinVisitor<LHS_RTYPE, INTSXP, ACCEPT_NA_MATCH>(left, right);
+ case REALSXP:
+ return new DateJoinVisitor<LHS_RTYPE, REALSXP, ACCEPT_NA_MATCH>(left, right);
+ default:
+ stop("Date objects should be represented as integer or numeric");
+ }
+}
+template <bool ACCEPT_NA_MATCH>
+JoinVisitor* date_join_visitor(const Column& left, const Column& right) {
+ switch (TYPEOF(left.get_data())) {
+ case INTSXP:
+ return date_join_visitor_right<INTSXP, ACCEPT_NA_MATCH>(left, right);
+ case REALSXP:
+ return date_join_visitor_right<REALSXP, ACCEPT_NA_MATCH>(left, right);
+ default:
+ stop("Date objects should be represented as integer or numeric");
+ }
+}
- template <int RTYPE>
- inline SEXP subset_join_double_int( JoinVisitorImpl<REALSXP,RTYPE>& joiner, const VisitorSetIndexSet<DataFrameJoinVisitors>& set ){
- int n = set.size() ;
- NumericVector res = no_init(n) ;
- VisitorSetIndexSet<DataFrameJoinVisitors>::const_iterator it=set.begin() ;
- for( int i=0; i<n; i++, ++it) {
- int index = *it ;
- if( index < 0){
- res[i] = Rcpp::internal::r_coerce<INTSXP,REALSXP>( joiner.right[-index-1] ) ;
- } else {
- res[i] = joiner.left[index] ;
- }
+template <bool ACCEPT_NA_MATCH>
+JoinVisitor* join_visitor(const Column& left, const Column& right, bool warn_) {
+ // handle Date separately
+ bool lhs_date = Rf_inherits(left.get_data(), "Date");
+ bool rhs_date = Rf_inherits(right.get_data(), "Date");
+
+ switch (lhs_date + rhs_date) {
+ case 2:
+ return date_join_visitor<ACCEPT_NA_MATCH>(left, right);
+ case 1:
+ stop("cannot join a Date object with an object that is not a Date object");
+ case 0:
+ break;
+ default:
+ break;
+ }
+
+ bool lhs_time = Rf_inherits(left.get_data(), "POSIXct");
+ bool rhs_time = Rf_inherits(right.get_data(), "POSIXct");
+ switch (lhs_time + rhs_time) {
+ case 2:
+ return new POSIXctJoinVisitor<ACCEPT_NA_MATCH>(left, right);
+ case 1:
+ stop("cannot join a POSIXct object with an object that is not a POSIXct object");
+ case 0:
+ break;
+ default:
+ break;
+ }
+
+ switch (TYPEOF(left.get_data())) {
+ case CPLXSXP:
+ {
+ switch (TYPEOF(right.get_data())) {
+ case CPLXSXP:
+ return new JoinVisitorImpl<CPLXSXP, CPLXSXP, ACCEPT_NA_MATCH>(left, right, warn_);
+ default:
+ break;
+ }
+ break;
+ }
+ case INTSXP:
+ {
+ bool lhs_factor = Rf_inherits(left.get_data(), "factor");
+ switch (TYPEOF(right.get_data())) {
+ case INTSXP:
+ {
+ bool rhs_factor = Rf_inherits(right.get_data(), "factor");
+ if (lhs_factor && rhs_factor) {
+ if (same_levels(left.get_data(), right.get_data())) {
+ return new JoinVisitorImpl<INTSXP, INTSXP, ACCEPT_NA_MATCH>(left, right, warn_);
+ } else {
+ warn_bad_var(
+ left.get_name(), right.get_name(),
+ "joining factors with different levels, coercing to character vector",
+ warn_
+ );
+ return
+ new JoinVisitorImpl<STRSXP, STRSXP, ACCEPT_NA_MATCH>(
+ left.update_data(reencode_char(left.get_data())),
+ right.update_data(reencode_char(right.get_data())),
+ warn_
+ );
}
- return res ;
- }
- template <>
- inline SEXP JoinVisitorImpl<REALSXP,INTSXP>::subset( const VisitorSetIndexSet<DataFrameJoinVisitors>& set ){
- return subset_join_double_int<INTSXP>(*this, set );
- }
- template <>
- inline SEXP JoinVisitorImpl<REALSXP,LGLSXP>::subset( const VisitorSetIndexSet<DataFrameJoinVisitors>& set ){
- return subset_join_double_int<LGLSXP>(*this, set );
- }
-
-
-
- // -----------------
- inline void incompatible_join_visitor(SEXP left, SEXP right, const std::string& name_left, const std::string& name_right) {
- stop( "Can't join on '%s' x '%s' because of incompatible types (%s / %s)",
- name_left, name_right, get_single_class(left), get_single_class(right)
- ) ;
- }
-
- int count_attributes( SEXP x) {
- int n = 0 ;
-
- while( ! Rf_isNull(x) ){
- SEXP name = TAG(x) ;
- if( name != R_NamesSymbol && name != R_DimSymbol ) n++ ;
- x = CDR(x) ;
+ } else if (!lhs_factor && !rhs_factor) {
+ return new JoinVisitorImpl<INTSXP, INTSXP, ACCEPT_NA_MATCH>(left, right, warn_);
}
-
- return n ;
+ break;
}
-
- SEXP grab_attribute( SEXP name, SEXP x){
- while( !Rf_isNull(x) ){
- if( TAG(x) == name ) return CAR(x) ;
- x = CDR(x) ;
+ case REALSXP:
+ {
+ if (!lhs_factor && is_bare_vector(right.get_data())) {
+ return new JoinVisitorImpl<INTSXP, REALSXP, ACCEPT_NA_MATCH>(left, right, warn_);
}
- stop( "cannot find attribute '%s' ", CHAR(PRINTNAME(name)) ) ;
- return x;
+ break;
+ // what else: perhaps we can have INTSXP which is a Date and REALSXP which is a Date too ?
}
-
- void check_attribute_compatibility( SEXP left, SEXP right){
- SEXP att_left = ATTRIB(left) ;
- SEXP att_right = ATTRIB(right) ;
- int n_left = count_attributes(att_left) ;
- int n_right = count_attributes(att_right) ;
-
- if( n_left != n_right)
- stop("atributes of different sizes") ;
-
- List list_left(n_left), list_right(n_left) ;
-
- SEXP p_left = att_left ;
- int i = 0 ;
- while( !Rf_isNull(p_left) ){
- SEXP name = TAG(p_left) ;
- if( name != R_NamesSymbol && name != R_DimSymbol){
- list_left[i] = CAR(p_left) ;
- list_right[i] = grab_attribute( name, att_right ) ;
- }
- p_left = CDR(p_left) ;
+ case LGLSXP:
+ {
+ if (!lhs_factor) {
+ return new JoinVisitorImpl<INTSXP, LGLSXP, ACCEPT_NA_MATCH>(left, right, warn_);
}
- RObject test = Language( "all.equal", list_left, list_right ).fast_eval() ;
- if( !is<bool>(test) || !as<bool>(test) ){
- stop("attributes are different") ;
+ break;
+ }
+ case STRSXP:
+ {
+ if (lhs_factor) {
+ warn_bad_var(
+ left.get_name(), right.get_name(),
+ "joining factor and character vector, coercing into character vector",
+ warn_
+ );
+ return
+ new JoinVisitorImpl<STRSXP, STRSXP, ACCEPT_NA_MATCH>(
+ left.update_data(reencode_char(left.get_data())),
+ right.update_data(reencode_char(right.get_data())),
+ warn_
+ );
}
-
}
+ default:
+ break;
+ }
+ break;
+ }
+ case REALSXP:
+ {
+ switch (TYPEOF(right.get_data())) {
+ case REALSXP:
+ return new JoinVisitorImpl<REALSXP, REALSXP, ACCEPT_NA_MATCH>(left, right, warn_);
+ case INTSXP:
+ return new JoinVisitorImpl<REALSXP, INTSXP, ACCEPT_NA_MATCH>(left, right, warn_);
+ default:
+ break;
+ }
+
+ }
+ case LGLSXP:
+ {
+ switch (TYPEOF(right.get_data())) {
+ case LGLSXP:
+ return new JoinVisitorImpl<LGLSXP, LGLSXP, ACCEPT_NA_MATCH> (left, right, warn_);
+ case INTSXP:
+ return new JoinVisitorImpl<LGLSXP, INTSXP, ACCEPT_NA_MATCH>(left, right, warn_);
+ case REALSXP:
+ return new JoinVisitorImpl<LGLSXP, REALSXP, ACCEPT_NA_MATCH>(left, right, warn_);
+ default:
+ break;
+ }
+ break;
+ }
+ case STRSXP:
+ {
+ switch (TYPEOF(right.get_data())) {
+ case INTSXP:
+ {
+ if (Rf_inherits(right.get_data(), "factor")) {
+ warn_bad_var(
+ left.get_name(), right.get_name(),
+ "joining character vector and factor, coercing into character vector",
+ warn_
+ );
+ return
+ new JoinVisitorImpl<STRSXP, STRSXP, ACCEPT_NA_MATCH>(
+ left.update_data(reencode_char(left.get_data())),
+ right.update_data(reencode_char(right.get_data())),
+ warn_
+ );
+ }
+ break;
+ }
+ case STRSXP:
+ {
+ return
+ new JoinVisitorImpl<STRSXP, STRSXP, ACCEPT_NA_MATCH>(
+ left.update_data(reencode_char(left.get_data())),
+ right.update_data(reencode_char(right.get_data())),
+ warn_
+ );
+ }
+ default:
+ break;
+ }
+ break;
+ }
+ default:
+ break;
+ }
+
+ stop(
+ "Can't join on '%s' x '%s' because of incompatible types (%s / %s)",
+ left.get_name().get_utf8_cstring(), right.get_name().get_utf8_cstring(),
+ get_single_class(left.get_data()), get_single_class(right.get_data())
+ );
+}
- JoinVisitor* join_visitor( SEXP left, SEXP right, const std::string& name_left, const std::string& name_right, bool warn_ ){
- // handle Date separately
- bool lhs_date = Rf_inherits( left, "Date") ;
- bool rhs_date = Rf_inherits( right, "Date") ;
-
- switch( lhs_date + rhs_date ){
- case 2: return new DateJoinVisitor( left, right ) ;
- case 1: stop( "cannot join a Date object with an object that is not a Date object" ) ;
- case 0: break ;
- default: break ;
- }
-
- bool lhs_time = Rf_inherits( left, "POSIXct" );
- bool rhs_time = Rf_inherits( right, "POSIXct" );
- switch( lhs_time + rhs_time ){
- case 2: return new POSIXctJoinVisitor( left, right) ;
- case 1: stop( "cannot join a POSIXct object with an object that is not a POSIXct object" ) ;
- case 0: break;
- default: break ;
- }
-
- switch( TYPEOF(left) ){
- case CPLXSXP:
- {
- switch( TYPEOF(right) ){
- case CPLXSXP: return new JoinVisitorImpl<CPLXSXP, CPLXSXP>( left, right ) ;
- default:
- break ;
- }
- break ;
- }
- case INTSXP:
- {
- bool lhs_factor = Rf_inherits( left, "factor" ) ;
- switch( TYPEOF(right) ){
- case INTSXP:
- {
- bool rhs_factor = Rf_inherits( right, "factor" ) ;
- if( lhs_factor && rhs_factor){
- if( same_levels(left, right) ){
- return new JoinVisitorImpl<INTSXP, INTSXP>( left, right) ;
- } else {
- if(warn_) Rf_warning( "joining factors with different levels, coercing to character vector" );
- return new JoinFactorFactorVisitor(left, right) ;
- }
- } else if( !lhs_factor && !rhs_factor) {
- return new JoinVisitorImpl<INTSXP, INTSXP>( left, right ) ;
- }
- break ;
- }
- case REALSXP:
- {
- if( lhs_factor ){
- incompatible_join_visitor(left, right, name_left, name_right) ;
- } else if( is_bare_vector(right) ) {
- return new JoinVisitorImpl<INTSXP, REALSXP>( left, right) ;
- } else {
- incompatible_join_visitor(left, right, name_left, name_right) ;
- }
- break ;
- // what else: perhaps we can have INTSXP which is a Date and REALSXP which is a Date too ?
- }
- case LGLSXP:
- {
- if( lhs_factor ){
- incompatible_join_visitor(left, right, name_left, name_right) ;
- } else {
- return new JoinVisitorImpl<INTSXP, LGLSXP>( left, right) ;
- }
- break ;
- }
- case STRSXP:
- {
- if( lhs_factor ){
- if(warn_) Rf_warning( "joining factor and character vector, coercing into character vector" ) ;
- return new JoinFactorStringVisitor( left, right );
- }
- }
- default: break ;
- }
- break ;
- }
- case REALSXP:
- {
- switch( TYPEOF(right) ){
- case REALSXP:
- return new JoinVisitorImpl<REALSXP, REALSXP>( left, right) ;
- case INTSXP:
- return new JoinVisitorImpl<REALSXP, INTSXP>( left, right) ;
- default: break ;
- }
-
- }
- case LGLSXP:
- {
- switch( TYPEOF(right) ){
- case LGLSXP:
- return new JoinVisitorImpl<LGLSXP,LGLSXP> ( left, right ) ;
- case INTSXP:
- return new JoinVisitorImpl<LGLSXP,INTSXP>( left, right ) ;
- case REALSXP:
- return new JoinVisitorImpl<LGLSXP,REALSXP>( left, right ) ;
- default: break ;
- }
- break ;
- }
- case STRSXP:
- {
- switch( TYPEOF(right) ){
- case INTSXP:
- {
- if( Rf_inherits(right, "factor" ) ){
- if(warn_) Rf_warning( "joining character vector and factor, coercing into character vector" ) ;
- return new JoinStringFactorVisitor( left, right ) ;
- }
- break ;
- }
- case STRSXP:
- {
- return new JoinStringStringVisitor( left, right ) ;
- }
- default: break ;
- }
- break ;
- }
- default: break ;
- }
-
- incompatible_join_visitor(left, right, name_left, name_right) ;
- return 0 ;
- }
+JoinVisitor* join_visitor(const Column& left, const Column& right, bool warn, bool accept_na_match) {
+ if (accept_na_match)
+ return join_visitor<true>(left, right, warn);
+ else
+ return join_visitor<false>(left, right, warn);
+}
}
diff --git a/src/join_exports.cpp b/src/join_exports.cpp
new file mode 100644
index 0000000..336fd41
--- /dev/null
+++ b/src/join_exports.cpp
@@ -0,0 +1,390 @@
+#include "pch.h"
+#include <dplyr/main.h>
+
+#include <tools/hash.h>
+#include <tools/match.h>
+
+#include <tools/Quosure.h>
+
+#include <dplyr/visitor_set/VisitorSetIndexMap.h>
+
+#include <dplyr/GroupedDataFrame.h>
+
+#include <dplyr/DataFrameJoinVisitors.h>
+
+#include <dplyr/train.h>
+
+#include <dplyr/bad.h>
+
+using namespace Rcpp;
+using namespace dplyr;
+
+template <typename Index>
+DataFrame subset_join(DataFrame x, DataFrame y,
+ const Index& indices_x, const Index& indices_y,
+ CharacterVector by_x, CharacterVector by_y,
+ const std::string& suffix_x, const std::string& suffix_y,
+ CharacterVector classes) {
+ if (suffix_x.length() == 0 && suffix_y.length() == 0) {
+ bad_arg("suffix", "can't be empty string for both `x` and `y` suffixes");
+ }
+
+ // first the joined columns
+ DataFrameJoinVisitors join_visitors(x, y, SymbolVector(by_x), SymbolVector(by_y), true, false);
+ int n_join_visitors = join_visitors.size();
+
+ // then columns from x but not y
+ CharacterVector all_x_columns = x.names();
+ std::vector<bool> joiner(all_x_columns.size());
+ CharacterVector x_columns(all_x_columns.size() - n_join_visitors);
+ IntegerVector xm = r_match(all_x_columns, by_x);
+ for (int i = 0, k = 0; i < all_x_columns.size(); i++) {
+ if (xm[i] == NA_INTEGER) {
+ joiner[i] = false;
+ x_columns[k++] = all_x_columns[i];
+ } else {
+ joiner[i] = true;
+ }
+ }
+ DataFrameSubsetVisitors visitors_x(x, SymbolVector(x_columns));
+ int nv_x = visitors_x.size();
+
+ // then columns from y but not x
+ CharacterVector all_y_columns = y.names();
+ CharacterVector y_columns(all_y_columns.size() - n_join_visitors);
+ IntegerVector ym = r_match(all_y_columns, by_y);
+ for (int i = 0, k = 0; i < all_y_columns.size(); i++) {
+ if (ym[i] == NA_INTEGER) {
+ y_columns[k++] = all_y_columns[i];
+ }
+ }
+ DataFrameSubsetVisitors visitors_y(y, SymbolVector(y_columns));
+
+ int nv_y = visitors_y.size();
+
+ // construct out object
+ int nrows = indices_x.size();
+ List out(n_join_visitors + nv_x + nv_y);
+ CharacterVector names(n_join_visitors + nv_x + nv_y);
+
+ int index_join_visitor = 0;
+ int index_x_visitor = 0;
+ // ---- join visitors
+ for (int i = 0; i < all_x_columns.size(); i++) {
+ String col_name = all_x_columns[i];
+ if (joiner[i]) {
+ JoinVisitor* v = join_visitors.get(xm[i] - 1);
+ out[i] = v->subset(indices_x);
+ index_join_visitor++;
+ } else {
+
+ // we suffix by .x if this column is in y_columns (and if the suffix is not empty)
+ if (suffix_x.length() > 0) {
+ while (
+ (std::find(y_columns.begin(), y_columns.end(), col_name.get_sexp()) != y_columns.end()) ||
+ (std::find(names.begin(), names.begin() + i, col_name.get_sexp()) != names.begin() + i)
+ ) {
+ col_name += suffix_x;
+ }
+ }
+
+ out[i] = visitors_x.get(index_x_visitor)->subset(indices_x);
+ index_x_visitor++;
+ }
+ names[i] = col_name;
+ }
+
+ int k = index_join_visitor + index_x_visitor;
+ for (int i = 0; i < nv_y; i++, k++) {
+ String col_name = y_columns[i];
+
+ // we suffix by .y if this column is in x_columns (and if the suffix is not empty)
+ if (suffix_y.length() > 0) {
+ while (
+ (std::find(all_x_columns.begin(), all_x_columns.end(), col_name.get_sexp()) != all_x_columns.end()) ||
+ (std::find(names.begin(), names.begin() + k, col_name.get_sexp()) != names.begin() + k)
+ ) {
+ col_name += suffix_y;
+ }
+ }
+
+ out[k] = visitors_y.get(i)->subset(indices_y);
+ names[k] = col_name;
+ }
+ set_class(out, classes);
+ set_rownames(out, nrows);
+ out.names() = names;
+
+ // out group columns
+ SymbolVector group_cols_x = get_vars(x);
+ int n_group_cols = group_cols_x.size();
+ SymbolVector group_cols(n_group_cols);
+ IntegerVector group_col_indices = group_cols_x.match_in_table(all_x_columns);
+ // get updated column names
+ for (int i = 0; i < n_group_cols; i++) {
+ int group_col_index = group_col_indices[i];
+ if (group_col_index != NA_INTEGER) {
+ group_cols.set(i, names[group_col_index - 1]);
+ } else {
+ stop("unknown group column '%s'", group_cols_x[i].get_utf8_cstring());
+ }
+ }
+ set_vars(out, group_cols);
+
+ return (SEXP)out;
+}
+
+template <typename TargetContainer, typename SourceContainer>
+void push_back(TargetContainer& x, const SourceContainer& y) {
+ x.insert(x.end(), y.begin(), y.end());
+}
+template <typename TargetContainer, typename SourceContainer>
+void push_back_right(TargetContainer& x, const SourceContainer& y) {
+ // x.insert( x.end(), y.begin(), y.end() );
+ int n = y.size();
+ for (int i = 0; i < n; i++) {
+ x.push_back(-y[i] - 1);
+ }
+}
+
+template <typename Container>
+void push_back(Container& x, typename Container::value_type value, int n) {
+ for (int i = 0; i < n; i++)
+ x.push_back(value);
+}
+
+void check_by(const CharacterVector& by) {
+ if (by.size() == 0) bad_arg("by", "must specify variables to join by");
+}
+
+// [[Rcpp::export]]
+DataFrame semi_join_impl(DataFrame x, DataFrame y, CharacterVector by_x, CharacterVector by_y, bool na_match) {
+ check_by(by_x);
+
+ typedef VisitorSetIndexMap<DataFrameJoinVisitors, std::vector<int> > Map;
+ DataFrameJoinVisitors visitors(x, y, SymbolVector(by_x), SymbolVector(by_y), true, na_match);
+ Map map(visitors);
+
+ // train the map in terms of x
+ train_push_back(map, x.nrows());
+
+ int n_y = y.nrows();
+ // this will collect indices from rows in x that match rows in y
+ std::vector<int> indices;
+ indices.reserve(x.nrows());
+ for (int i = 0; i < n_y; i++) {
+ // find a row in x that matches row i from y
+ Map::iterator it = map.find(-i - 1);
+
+ if (it != map.end()) {
+ // collect the indices and remove them from the
+ // map so that they are only found once.
+ push_back(indices, it->second);
+
+ map.erase(it);
+
+ }
+ }
+
+ std::sort(indices.begin(), indices.end());
+
+ const DataFrame& out = subset(x, indices, x.names(), get_class(x));
+ strip_index(out);
+ return out;
+}
+
+// [[Rcpp::export]]
+DataFrame anti_join_impl(DataFrame x, DataFrame y, CharacterVector by_x, CharacterVector by_y, bool na_match) {
+ check_by(by_x);
+
+ typedef VisitorSetIndexMap<DataFrameJoinVisitors, std::vector<int> > Map;
+ DataFrameJoinVisitors visitors(x, y, SymbolVector(by_x), SymbolVector(by_y), true, na_match);
+ Map map(visitors);
+
+ // train the map in terms of x
+ train_push_back(map, x.nrows());
+
+ int n_y = y.nrows();
+ // remove the rows in x that match
+ for (int i = 0; i < n_y; i++) {
+ Map::iterator it = map.find(-i - 1);
+ if (it != map.end())
+ map.erase(it);
+ }
+
+ // collect what's left
+ std::vector<int> indices;
+ indices.reserve(map.size());
+ for (Map::iterator it = map.begin(); it != map.end(); ++it)
+ push_back(indices, it->second);
+
+ std::sort(indices.begin(), indices.end());
+
+ const DataFrame& out = subset(x, indices, x.names(), get_class(x));
+ strip_index(out);
+ return out;
+}
+
+// [[Rcpp::export]]
+DataFrame inner_join_impl(DataFrame x, DataFrame y,
+ CharacterVector by_x, CharacterVector by_y,
+ std::string& suffix_x, std::string& suffix_y,
+ bool na_match) {
+ check_by(by_x);
+
+ typedef VisitorSetIndexMap<DataFrameJoinVisitors, std::vector<int> > Map;
+ DataFrameJoinVisitors visitors(x, y, SymbolVector(by_x), SymbolVector(by_y), false, na_match);
+ Map map(visitors);
+
+ int n_x = x.nrows(), n_y = y.nrows();
+
+ std::vector<int> indices_x;
+ std::vector<int> indices_y;
+
+ train_push_back_right(map, n_y);
+
+ for (int i = 0; i < n_x; i++) {
+ Map::iterator it = map.find(i);
+ if (it != map.end()) {
+ push_back_right(indices_y, it->second);
+ push_back(indices_x, i, it->second.size());
+ }
+ }
+
+ return subset_join(x, y,
+ indices_x, indices_y,
+ by_x, by_y,
+ suffix_x, suffix_y,
+ get_class(x)
+ );
+}
+
+// [[Rcpp::export]]
+DataFrame left_join_impl(DataFrame x, DataFrame y,
+ CharacterVector by_x, CharacterVector by_y,
+ std::string& suffix_x, std::string& suffix_y,
+ bool na_match) {
+ check_by(by_x);
+
+ typedef VisitorSetIndexMap<DataFrameJoinVisitors, std::vector<int> > Map;
+ DataFrameJoinVisitors visitors(y, x, SymbolVector(by_y), SymbolVector(by_x), false, na_match);
+
+ Map map(visitors);
+
+ // train the map in terms of y
+ train_push_back(map, y.nrows());
+
+ std::vector<int> indices_x;
+ std::vector<int> indices_y;
+
+ int n_x = x.nrows();
+ for (int i = 0; i < n_x; i++) {
+ // find a row in y that matches row i in x
+ Map::iterator it = map.find(-i - 1);
+ if (it != map.end()) {
+ push_back(indices_y, it->second);
+ push_back(indices_x, i, it->second.size());
+ } else {
+ indices_y.push_back(-1); // mark NA
+ indices_x.push_back(i);
+ }
+ }
+
+ return subset_join(x, y,
+ indices_x, indices_y,
+ by_x, by_y,
+ suffix_x, suffix_y,
+ get_class(x)
+ );
+}
+
+// [[Rcpp::export]]
+DataFrame right_join_impl(DataFrame x, DataFrame y,
+ CharacterVector by_x, CharacterVector by_y,
+ std::string& suffix_x, std::string& suffix_y,
+ bool na_match) {
+ check_by(by_x);
+
+ typedef VisitorSetIndexMap<DataFrameJoinVisitors, std::vector<int> > Map;
+ DataFrameJoinVisitors visitors(x, y, SymbolVector(by_x), SymbolVector(by_y), false, na_match);
+ Map map(visitors);
+
+ // train the map in terms of x
+ train_push_back(map, x.nrows());
+
+ std::vector<int> indices_x;
+ std::vector<int> indices_y;
+
+ int n_y = y.nrows();
+ for (int i = 0; i < n_y; i++) {
+ // find a row in y that matches row i in x
+ Map::iterator it = map.find(-i - 1);
+ if (it != map.end()) {
+ push_back(indices_x, it->second);
+ push_back(indices_y, i, it->second.size());
+ } else {
+ indices_x.push_back(-i - 1); // point to the i-th row in the right table
+ indices_y.push_back(i);
+ }
+ }
+ return subset_join(x, y,
+ indices_x, indices_y,
+ by_x, by_y,
+ suffix_x, suffix_y,
+ get_class(x)
+ );
+}
+
+// [[Rcpp::export]]
+DataFrame full_join_impl(DataFrame x, DataFrame y,
+ CharacterVector by_x, CharacterVector by_y,
+ std::string& suffix_x, std::string& suffix_y,
+ bool na_match) {
+ check_by(by_x);
+
+ typedef VisitorSetIndexMap<DataFrameJoinVisitors, std::vector<int> > Map;
+ DataFrameJoinVisitors visitors(y, x, SymbolVector(by_y), SymbolVector(by_x), false, na_match);
+ Map map(visitors);
+
+ // train the map in terms of y
+ train_push_back(map, y.nrows());
+
+ std::vector<int> indices_x;
+ std::vector<int> indices_y;
+
+ int n_x = x.nrows(), n_y = y.nrows();
+
+ // get both the matches and the rows from left but not right
+ for (int i = 0; i < n_x; i++) {
+ // find a row in y that matches row i in x
+ Map::iterator it = map.find(-i - 1);
+ if (it != map.end()) {
+ push_back(indices_y, it->second);
+ push_back(indices_x, i, it->second.size());
+ } else {
+ indices_y.push_back(-1); // mark NA
+ indices_x.push_back(i);
+ }
+ }
+
+ // train a new map in terms of x this time
+ DataFrameJoinVisitors visitors2(x, y, SymbolVector(by_x), SymbolVector(by_y), false, na_match);
+ Map map2(visitors2);
+ train_push_back(map2, x.nrows());
+
+ for (int i = 0; i < n_y; i++) {
+ // try to find row in x that matches this row of y
+ Map::iterator it = map2.find(-i - 1);
+ if (it == map2.end()) {
+ indices_x.push_back(-i - 1);
+ indices_y.push_back(i);
+ }
+ }
+
+ return subset_join(x, y,
+ indices_x, indices_y,
+ by_x, by_y,
+ suffix_x, suffix_y,
+ get_class(x)
+ );
+}
diff --git a/src/mutate.cpp b/src/mutate.cpp
new file mode 100644
index 0000000..4ea7546
--- /dev/null
+++ b/src/mutate.cpp
@@ -0,0 +1,212 @@
+#include "pch.h"
+#include <dplyr/main.h>
+
+#include <boost/scoped_ptr.hpp>
+
+#include <tools/Quosure.h>
+
+#include <dplyr/checks.h>
+
+#include <dplyr/GroupedDataFrame.h>
+
+#include <dplyr/Result/LazyRowwiseSubsets.h>
+#include <dplyr/Result/CallProxy.h>
+
+#include <dplyr/Gatherer.h>
+#include <dplyr/NamedListAccumulator.h>
+
+#include <dplyr/bad.h>
+
+using namespace Rcpp;
+using namespace dplyr;
+
+template <typename Data>
+SEXP structure_mutate(const NamedListAccumulator<Data>& accumulator,
+ const DataFrame& df,
+ CharacterVector classes,
+ bool grouped = true) {
+ List res = accumulator;
+ set_class(res, classes);
+ set_rownames(res, df.nrows());
+
+ if (grouped) {
+ copy_vars(res, df);
+ res.attr("labels") = df.attr("labels");
+ res.attr("index") = df.attr("index");
+ res.attr("indices") = df.attr("indices");
+ res.attr("drop") = df.attr("drop");
+ res.attr("group_sizes") = df.attr("group_sizes");
+ res.attr("biggest_group_size") = df.attr("biggest_group_size");
+ }
+
+ return res;
+}
+
+void check_not_groups(const QuosureList&, const RowwiseDataFrame&) {}
+
+void check_not_groups(const QuosureList& quosures, const GroupedDataFrame& gdf) {
+ int n = quosures.size();
+ for (int i = 0; i < n; i++) {
+ if (gdf.has_group(quosures[i].name()))
+ bad_col(quosures[i].name(), "can't be modified because it's a grouping variable");
+ }
+}
+
+static
+SEXP validate_unquoted_value(SEXP value, int nrows, SymbolString& name) {
+ if (is_vector(value))
+ check_length(Rf_length(value), nrows, "the number of rows", name);
+ else
+ bad_col(name, "is of unsupported type {type}", _["type"] = Rf_type2char(TYPEOF(value)));
+ return value;
+}
+
+DataFrame mutate_not_grouped(DataFrame df, const QuosureList& dots) {
+ const int nexpr = dots.size();
+ const int nrows = df.nrows();
+
+ NamedListAccumulator<DataFrame> accumulator;
+ const int nvars = df.size();
+ if (nvars) {
+ CharacterVector df_names = df.names();
+ for (int i = 0; i < nvars; i++) {
+ accumulator.set(df_names[i], df[i]);
+ }
+ }
+
+ CallProxy call_proxy(df);
+
+ for (int i = 0; i < nexpr; i++) {
+ Rcpp::checkUserInterrupt();
+ const NamedQuosure& quosure = dots[i];
+
+ Shield<SEXP> call_(quosure.expr());
+ SEXP call = call_;
+ SymbolString name = quosure.name();
+ Environment env = quosure.env();
+ call_proxy.set_env(env);
+
+ RObject variable;
+ if (TYPEOF(call) == SYMSXP) {
+ SymbolString call_name = SymbolString(Symbol(call));
+ if (call_proxy.has_variable(call_name)) {
+ variable = call_proxy.get_variable(call_name);
+ } else {
+ variable = shared_SEXP(env.find(call_name.get_string()));
+ }
+ } else if (TYPEOF(call) == LANGSXP) {
+ call_proxy.set_call(call);
+ variable = call_proxy.eval();
+ } else if (Rf_length(call) == 1) {
+ boost::scoped_ptr<Gatherer> gather(constant_gatherer(call, nrows, name));
+ variable = gather->collect();
+ } else if (Rf_isNull(call)) {
+ accumulator.rm(name);
+ continue;
+ } else {
+ variable = validate_unquoted_value(call, nrows, name);
+ }
+
+ if (Rf_inherits(variable, "POSIXlt")) {
+ bad_col(quosure.name(), "is of unsupported class POSIXlt");
+ }
+
+ const int n_res = Rf_length(variable);
+ check_supported_type(variable, name);
+ check_length(n_res, nrows, "the number of rows", name);
+
+ if (n_res == 1 && nrows != 1) {
+ // recycle
+ boost::scoped_ptr<Gatherer> gather(constant_gatherer(variable, nrows, name));
+ variable = gather->collect();
+ }
+
+ call_proxy.input(name, variable);
+ accumulator.set(name, variable);
+ }
+ List res = structure_mutate(accumulator, df, classes_not_grouped(), false);
+
+ return res;
+}
+
+template <typename Data, typename Subsets>
+DataFrame mutate_grouped(const DataFrame& df, const QuosureList& dots) {
+ LOG_VERBOSE << "checking zero rows";
+
+ // special 0 rows case
+ if (df.nrows() == 0) {
+ DataFrame res = mutate_not_grouped(df, dots);
+ copy_vars(res, df);
+ set_class(res, get_class(df));
+ return Data(res).data();
+ }
+
+ LOG_VERBOSE << "initializing proxy";
+
+ typedef GroupedCallProxy<Data, Subsets> Proxy;
+ Data gdf(df);
+ int nexpr = dots.size();
+ check_not_groups(dots, gdf);
+
+ Proxy proxy(gdf);
+
+ LOG_VERBOSE << "copying data to accumulator";
+
+ NamedListAccumulator<Data> accumulator;
+ int ncolumns = df.size();
+ CharacterVector column_names = df.names();
+ for (int i = 0; i < ncolumns; i++) {
+ accumulator.set(column_names[i], df[i]);
+ }
+
+ LOG_VERBOSE << "processing " << nexpr << " variables";
+
+ for (int i = 0; i < nexpr; i++) {
+ Rcpp::checkUserInterrupt();
+ const NamedQuosure& quosure = dots[i];
+
+ Environment env = quosure.env();
+ Shield<SEXP> call_(quosure.expr());
+ SEXP call = call_;
+ SymbolString name = quosure.name();
+ proxy.set_env(env);
+
+ LOG_VERBOSE << "processing " << name.get_utf8_cstring();
+
+ RObject variable;
+
+ if (TYPEOF(call) == LANGSXP || TYPEOF(call) == SYMSXP) {
+ proxy.set_call(call);
+ boost::scoped_ptr<Gatherer> gather(gatherer<Data, Subsets>(proxy, gdf, name));
+ variable = gather->collect();
+ } else if (Rf_length(call) == 1) {
+ boost::scoped_ptr<Gatherer> gather(constant_gatherer(call, gdf.nrows(), name));
+ variable = gather->collect();
+ } else if (Rf_isNull(call)) {
+ accumulator.rm(name);
+ continue;
+ } else {
+ variable = validate_unquoted_value(call, gdf.nrows(), name);
+ }
+
+ Rf_setAttrib(variable, R_NamesSymbol, R_NilValue);
+ proxy.input(name, variable);
+ accumulator.set(name, variable);
+ }
+
+ return structure_mutate(accumulator, df, get_class(df));
+}
+
+
+// [[Rcpp::export]]
+SEXP mutate_impl(DataFrame df, QuosureList dots) {
+ if (dots.size() == 0) return df;
+ check_valid_colnames(df);
+ if (is<RowwiseDataFrame>(df)) {
+ return mutate_grouped<RowwiseDataFrame, LazyRowwiseSubsets>(df, dots);
+ } else if (is<GroupedDataFrame>(df)) {
+ return mutate_grouped<GroupedDataFrame, LazyGroupedSubsets>(df, dots);
+ } else {
+ return mutate_not_grouped(df, dots);
+ }
+}
diff --git a/src/nth.cpp b/src/nth.cpp
deleted file mode 100644
index 156e6dc..0000000
--- a/src/nth.cpp
+++ /dev/null
@@ -1,240 +0,0 @@
-#include <dplyr.h>
-
-using namespace Rcpp ;
-using namespace dplyr ;
-
-namespace dplyr {
-
- template <int RTYPE>
- class Nth : public Processor< RTYPE, Nth<RTYPE> > {
- public:
- typedef Processor< RTYPE, Nth<RTYPE> > Base ;
- typedef typename Rcpp::traits::storage_type<RTYPE>::type STORAGE ;
-
- Nth( Vector<RTYPE> data_, int idx_, STORAGE def_ = Vector<RTYPE>::get_na() ) :
- Base(data_),
- data(data_),
- idx(idx_),
- def(def_) {}
-
- inline STORAGE process_chunk( const SlicingIndex& indices ){
- int n = indices.size() ;
- if( n == 0 || idx > n || idx < -n ) return def ;
- int i = idx > 0 ? (idx -1) : (n+idx) ;
- return data[indices[i]] ;
- }
-
- private:
- Vector<RTYPE> data ;
- int idx ;
- STORAGE def ;
- } ;
-
- template <int RTYPE, int ORDER_RTYPE>
- class NthWith : public Processor< RTYPE, NthWith<RTYPE, ORDER_RTYPE> > {
- public:
- typedef Processor< RTYPE, NthWith<RTYPE, ORDER_RTYPE> > Base ;
- typedef typename Rcpp::traits::storage_type<RTYPE>::type STORAGE ;
-
- NthWith( Vector<RTYPE> data_, int idx_, Vector<ORDER_RTYPE> order_, STORAGE def_ = Vector<RTYPE>::get_na() ) :
- Base(data_),
- data(data_),
- idx(idx_),
- order(order_),
- def(def_) {}
-
- inline STORAGE process_chunk( const SlicingIndex& indices ){
- int n = indices.size() ;
- if( n == 0 || idx > n || idx < -n) return def ;
-
- int i = idx > 0 ? (idx -1) : (n+idx) ;
-
- typedef VectorSliceVisitor<ORDER_RTYPE> Slice ;
- typedef OrderVectorVisitorImpl<ORDER_RTYPE,true,Slice> Visitor ;
- typedef Compare_Single_OrderVisitor<Visitor> Comparer ;
-
- Comparer comparer( Visitor( Slice(order, indices ) ) ) ;
- IntegerVector sequence = seq(0,n-1) ;
- std::nth_element( sequence.begin(), sequence.begin() + i, sequence.end(), comparer ) ;
-
- return data[ indices[ sequence[i] ] ] ;
- }
-
- private:
- Vector<RTYPE> data ;
- int idx ;
- Vector<ORDER_RTYPE> order ;
- STORAGE def ;
- } ;
-
-}
-
-template <int RTYPE>
-Result* nth_noorder_default( Vector<RTYPE> data, int idx, Vector<RTYPE> def ){
- return new Nth<RTYPE>(data, idx, def[0] );
-}
-
-template <int RTYPE>
-Result* nth_with( Vector<RTYPE> data, int idx, SEXP order ){
- switch( TYPEOF(order) ){
- case LGLSXP: return new NthWith<RTYPE, LGLSXP>( data, idx, order );
- case INTSXP: return new NthWith<RTYPE, INTSXP>( data, idx, order );
- case REALSXP: return new NthWith<RTYPE, REALSXP>( data, idx, order );
- case STRSXP: return new NthWith<RTYPE, STRSXP>( data, idx, order );
- default: break ;
- }
- stop("Unsupported vector type %s", Rf_type2char(TYPEOF(order))) ;
- return 0 ;
-}
-
-template <int RTYPE>
-Result* nth_with_default( Vector<RTYPE> data, int idx, SEXP order, Vector<RTYPE> def ){
- switch( TYPEOF(order) ){
- case LGLSXP: return new NthWith<RTYPE, LGLSXP>( data, idx, order, def[0] );
- case INTSXP: return new NthWith<RTYPE, INTSXP>( data, idx, order, def[0] );
- case REALSXP: return new NthWith<RTYPE, REALSXP>( data, idx, order, def[0] );
- case STRSXP: return new NthWith<RTYPE, STRSXP>( data, idx, order, def[0] );
- default: break ;
- }
- stop("Unsupported vector type %s", Rf_type2char(TYPEOF(order))) ;
- return 0 ;
-}
-
-Result* nth_prototype( SEXP call, const LazySubsets& subsets, int nargs){
- // has to have at least two arguments
- if( nargs < 2 ) return 0 ;
-
- SEXP tag = TAG(CDR(call)) ;
- if( tag != R_NilValue && tag != Rf_install("x") ){
- stop( "the first argument of 'nth' should be either 'x' or unnamed" ) ;
- }
- SEXP data = CADR(call) ;
- if( TYPEOF(data) == SYMSXP ) {
- if( ! subsets.count(data) ){
- stop( "could not find variable '%s'", CHAR(PRINTNAME(data)) );
- }
- data = subsets.get_variable(data) ;
- }
-
- tag = TAG(CDDR(call)) ;
- if( tag != R_NilValue && tag != Rf_install("n") ){
- stop( "the second argument of 'first' should be either 'n' or unnamed" ) ;
- }
- SEXP nidx = CADDR(call) ;
- if( ( TYPEOF(nidx) != REALSXP && TYPEOF(nidx) != INTSXP ) || LENGTH(nidx) != 1 ){
- // we only know how to handle the case where nidx is a length one
- // integer or numeric. In any other case, e.g. an expression for R to evaluate
- // we just fallback to R evaluation (#734)
- return 0 ;
- }
- int idx = as<int>(nidx) ;
-
- // easy case : just a single variable: first(x,n)
- if( nargs == 2 ){
- switch( TYPEOF(data) ){
- case INTSXP: return new Nth<INTSXP>(data, idx) ;
- case REALSXP: return new Nth<REALSXP>(data, idx) ;
- case STRSXP: return new Nth<STRSXP>(data, idx) ;
- case LGLSXP: return new Nth<LGLSXP>(data, idx) ;
- default: break ;
- }
- } else {
- // now get `order_by` and default
-
- SEXP order_by = R_NilValue ;
- SEXP def = R_NilValue ;
-
- SEXP p = CDR(CDDR(call)) ;
- while( p != R_NilValue ){
- SEXP tag = TAG(p) ;
- if( tag == R_NilValue ) stop( "all arguments of 'first' after the first one should be named" ) ;
- std::string argname = CHAR(PRINTNAME(tag));
- if( argmatch( "order_by", argname ) ){
- order_by = CAR(p) ;
- } else if( argmatch( "default", argname ) ){
- def = CAR(p) ;
- } else {
- stop("argument to 'first' does not match either 'default' or 'order_by' ") ;
- }
-
- p = CDR(p) ;
- }
-
-
- // handle cases
- if( def == R_NilValue ){
-
- // then we know order_by is not NULL, we only handle the case where
- // order_by is a symbol and that symbol is in the data
- if( TYPEOF(order_by) == SYMSXP && subsets.count(order_by) ){
- order_by = subsets.get_variable(order_by) ;
-
- switch( TYPEOF(data) ){
- case LGLSXP: return nth_with<LGLSXP>( data, idx, order_by ) ;
- case INTSXP: return nth_with<INTSXP>( data, idx, order_by ) ;
- case REALSXP: return nth_with<REALSXP>( data, idx, order_by ) ;
- case STRSXP: return nth_with<STRSXP>( data, idx, order_by ) ;
- default: break ;
- }
- }
- else {
- return 0;
- }
-
-
- } else {
- if( order_by == R_NilValue ){
- switch( TYPEOF(data) ){
- case LGLSXP: return nth_noorder_default<LGLSXP>(data, idx, def) ;
- case INTSXP: return nth_noorder_default<INTSXP>(data, idx, def) ;
- case REALSXP: return nth_noorder_default<REALSXP>(data, idx, def) ;
- case STRSXP: return nth_noorder_default<STRSXP>(data, idx, def) ;
- default: break ;
- }
- } else {
- if( TYPEOF(order_by) == SYMSXP && subsets.count(order_by) ){
- order_by = subsets.get_variable(order_by) ;
-
- switch( TYPEOF(data) ){
- case LGLSXP: return nth_with_default<LGLSXP>(data, idx, order_by, def) ;
- case INTSXP: return nth_with_default<INTSXP>(data, idx, order_by, def) ;
- case REALSXP: return nth_with_default<REALSXP>(data, idx, order_by, def) ;
- case STRSXP: return nth_with_default<STRSXP>(data, idx, order_by, def) ;
- default: break ;
- }
- }
- else {
- return 0;
- }
-
- }
- }
-
- }
- stop("Unsupported vector type %s", Rf_type2char(TYPEOF(data))) ;
- return 0;
-}
-
-Result* firstlast_prototype( SEXP call, const LazySubsets& subsets, int nargs, int pos){
- SEXP tail = CDDR(call) ;
-
- SETCAR(call, Rf_install("nth")) ;
-
- Pairlist p(pos) ;
- if( Rf_isNull(tail) ){
- SETCDR(CDR(call), p) ;
- } else {
- SETCDR(p, tail) ;
- SETCDR(CDR(call), p) ;
- }
- Result* res = nth_prototype(call, subsets, nargs + 1) ;
- return res ;
-}
-
-Result* first_prototype( SEXP call, const LazySubsets& subsets, int nargs ){
- return firstlast_prototype(call, subsets, nargs, 1) ;
-}
-
-Result* last_prototype( SEXP call, const LazySubsets& subsets, int nargs ){
- return firstlast_prototype(call, subsets, nargs, -1) ;
-}
diff --git a/src/pch.h b/src/pch.h
new file mode 100644
index 0000000..6af9b85
--- /dev/null
+++ b/src/pch.h
@@ -0,0 +1 @@
+#include <Rcpp.h>
diff --git a/src/rlang-export.c b/src/rlang-export.c
new file mode 100644
index 0000000..4d7a513
--- /dev/null
+++ b/src/rlang-export.c
@@ -0,0 +1,37 @@
+#define R_NO_REMAP
+#include <Rinternals.h>
+#include <Rversion.h>
+
+#include <tools/rlang-export.h>
+
+#if (defined(R_VERSION) && R_VERSION < R_Version(3, 4, 0))
+SEXP R_MakeExternalPtrFn(DL_FUNC p, SEXP tag, SEXP prot) {
+ fn_ptr ptr;
+ ptr.fn = p;
+ return R_MakeExternalPtr(ptr.p, tag, prot);
+}
+DL_FUNC R_ExternalPtrAddrFn(SEXP s) {
+ fn_ptr ptr;
+ ptr.p = EXTPTR_PTR(s);
+ return ptr.fn;
+}
+#endif
+
+SEXP rlang_namespace(const char* ns) {
+ SEXP call = PROTECT(Rf_lang2(Rf_install("getNamespace"), PROTECT(Rf_mkString(ns))));
+ SEXP ns_env = Rf_eval(call, R_BaseEnv);
+ UNPROTECT(2);
+ return ns_env;
+}
+
+void rlang_register_pointer(const char* ns, const char* ptr_name, DL_FUNC fn) {
+ SEXP ptr = PROTECT(R_MakeExternalPtrFn(fn, R_NilValue, R_NilValue));
+
+ SEXP ptr_obj = PROTECT(Rf_allocVector(VECSXP, 1));
+ SET_VECTOR_ELT(ptr_obj, 0, ptr);
+
+ Rf_setAttrib(ptr_obj, R_ClassSymbol, Rf_mkString("fn_pointer"));
+
+ Rf_defineVar(Rf_install(ptr_name), ptr_obj, PROTECT(rlang_namespace(ns)));
+ UNPROTECT(3);
+}
diff --git a/src/select.cpp b/src/select.cpp
index 2c02111..7d263cf 100644
--- a/src/select.cpp
+++ b/src/select.cpp
@@ -1,80 +1,86 @@
-#include <dplyr.h>
+#include "pch.h"
+#include <dplyr/main.h>
-using namespace Rcpp ;
-using namespace dplyr ;
+#include <tools/utils.h>
-SEXP select_not_grouped( const DataFrame& df, const CharacterVector& keep, CharacterVector new_names ){
- CharacterVector names = df.names() ;
- IntegerVector positions = match( keep, names );
- int n = keep.size() ;
- List res(n) ;
- for( int i=0; i<n; i++){
- int pos = positions[i] ;
- if( pos < 1 || pos > df.size() ){
- std::stringstream s ;
- if( pos == NA_INTEGER ){
- s << "NA" ;
- } else {
- s << pos ;
- }
- stop( "invalid column index : %d for variable: %s = %s",
- s.str(), CHAR((SEXP)new_names[i]), CHAR((SEXP)keep[i]) );
+#include <dplyr/GroupedDataFrame.h>
+
+using namespace Rcpp;
+using namespace dplyr;
+
+SEXP select_not_grouped(const DataFrame& df, const SymbolVector& keep, const SymbolVector& new_names) {
+ IntegerVector positions = keep.match_in_table(df.names());
+ int n = keep.size();
+ List res(n);
+ for (int i = 0; i < n; i++) {
+ int pos = positions[i];
+ if (pos < 1 || pos > df.size()) {
+ std::stringstream s;
+ if (pos == NA_INTEGER) {
+ s << "NA";
+ } else {
+ s << pos;
}
- res[i] = df[ pos-1 ] ;
+ stop("invalid column index : %d for variable: '%s' = '%s'",
+ s.str(), new_names[i].get_utf8_cstring(), keep[i].get_utf8_cstring());
+ }
+ res[i] = df[ pos - 1 ];
}
- copy_most_attributes(res, df) ;
- res.names() = new_names ;
- return res ;
+ copy_most_attributes(res, df);
+ res.names() = new_names;
+ return res;
}
-DataFrame select_grouped( GroupedDataFrame gdf, const CharacterVector& keep, CharacterVector new_names ){
- int n = keep.size() ;
- DataFrame copy = select_not_grouped( gdf.data(), keep, new_names );
+DataFrame select_grouped(GroupedDataFrame gdf, const SymbolVector& keep, const SymbolVector& new_names) {
+ DataFrame copy = select_not_grouped(gdf.data(), keep, new_names);
+
+ SymbolMap keep_map(keep);
// handle vars attribute : make a shallow copy of the list and alter
// its names attribute
- List vars = shallow_copy( copy.attr("vars") );
+ SymbolVector vars(get_vars(copy, true));
- int nv = vars.size() ;
- for( int i=0; i<nv; i++){
- SEXP s = PRINTNAME(vars[i]) ;
- int j = 0;
- for( ; j < n; j++){
- if( s == keep[j] ){
- vars[i] = Rf_installChar( new_names[j] );
- }
+ int nv = vars.size();
+ for (int i = 0; i < nv; i++) {
+ SymbolString s = vars[i];
+ SymbolMapIndex j = keep_map.get_index(s);
+ if (j.origin != NEW) {
+ vars.set(i, new_names[j.pos]);
}
}
- copy.attr("vars") = vars ;
+ set_vars(copy, vars);
- // hangle labels attribute
+ // handle labels attribute
// make a shallow copy of the data frame and alter its names attributes
- if( !Rf_isNull( copy.attr("labels" ) ) ){
+ if (!Rf_isNull(copy.attr("labels"))) {
- DataFrame original_labels( copy.attr("labels" ) ) ;
+ DataFrame original_labels(copy.attr("labels"));
- DataFrame labels( shallow_copy(original_labels));
- CharacterVector label_names = clone<CharacterVector>( labels.names() ) ;
+ DataFrame labels(shallow_copy(original_labels));
+ CharacterVector label_names = clone<CharacterVector>(labels.names());
- IntegerVector positions = match( label_names, keep );
- int nl = label_names.size() ;
- for( int i=0; i<nl; i++){
- label_names[i] = new_names[ positions[i]-1 ] ;
+ IntegerVector positions = keep.match(label_names);
+ int nl = label_names.size();
+ for (int i = 0; i < nl; i++) {
+ int pos = positions[i];
+ if (pos != NA_INTEGER) {
+ label_names[i] = new_names[pos - 1].get_string();
+ }
}
- labels.names() = label_names ;
- labels.attr("vars") = vars ;
- copy.attr("labels") = labels ;
+ labels.names() = label_names;
+ set_vars(labels, vars);
+ copy.attr("labels") = labels;
}
- return copy ;
+ return copy;
}
// [[Rcpp::export]]
-DataFrame select_impl( DataFrame df, CharacterVector vars ){
- check_valid_colnames(df) ;
- if( is<GroupedDataFrame>(df) ){
- return select_grouped( GroupedDataFrame(df), vars, vars.names() ) ;
+DataFrame select_impl(DataFrame df, CharacterVector vars) {
+ check_valid_colnames(df);
+ if (is<GroupedDataFrame>(df)) {
+ return select_grouped(GroupedDataFrame(df), SymbolVector(vars), SymbolVector(vars.names()));
} else {
- return select_not_grouped(df, vars, vars.names() ) ;
+ return select_not_grouped(df, SymbolVector(vars), SymbolVector(vars.names()));
}
}
diff --git a/src/set.cpp b/src/set.cpp
new file mode 100644
index 0000000..a6520f7
--- /dev/null
+++ b/src/set.cpp
@@ -0,0 +1,310 @@
+#include "pch.h"
+#include <dplyr/main.h>
+
+#include <boost/scoped_ptr.hpp>
+
+#include <tools/match.h>
+#include <tools/collapse.h>
+
+#include <dplyr/visitor_set/VisitorSetIndexSet.h>
+#include <dplyr/visitor_set/VisitorSetIndexMap.h>
+
+#include <dplyr/BoolResult.h>
+
+#include <dplyr/DataFrameSubsetVisitors.h>
+#include <dplyr/JoinVisitor.h>
+#include <dplyr/DataFrameJoinVisitors.h>
+
+#include <dplyr/train.h>
+
+using namespace Rcpp;
+using namespace dplyr;
+
+class RowTrack {
+public:
+ RowTrack(const std::string& msg, int max_count_ = 10) : ss(), count(0), max_count(max_count_) {
+ ss << msg;
+ }
+
+ void record(int i) {
+ if (count > max_count) return;
+ if (count) ss << ", ";
+ int idx = i >= 0 ? (i + 1) : -i;
+ ss << idx;
+ if (count == max_count) ss << "[...]";
+ count++;
+ }
+
+ bool empty() const {
+ return count == 0;
+ }
+
+ std::string str() const {
+ return ss.str();
+ }
+
+private:
+ std::stringstream ss;
+ int count;
+ int max_count;
+};
+
+// [[Rcpp::export]]
+dplyr::BoolResult compatible_data_frame_nonames(DataFrame x, DataFrame y, bool convert) {
+ int n = x.size();
+ if (n != y.size())
+ return no_because(tfm::format("different number of columns : %d x %d", n, y.size()));
+
+ if (convert) {
+ for (int i = 0; i < n; i++) {
+ try {
+ boost::scoped_ptr<JoinVisitor> v(
+ join_visitor(
+ Column(x[i], SymbolString("x")), Column(y[i], SymbolString("y")), true, true
+ )
+ );
+ } catch (...) {
+ return no_because("incompatible");
+ }
+ }
+ } else {
+ for (int i = 0; i < n; i++) {
+ SEXP xi = x[i], yi = y[i];
+ if (TYPEOF(xi) != TYPEOF(yi))
+ return no_because("incompatible types");
+
+ if (TYPEOF(xi) == INTSXP) {
+ if (Rf_inherits(xi, "factor") && Rf_inherits(yi, "factor")) {
+ if (same_levels(xi, yi)) continue;
+ return no_because("factors with different levels");
+ }
+
+ if (Rf_inherits(xi, "factor")) return no_because("cannot compare factor and integer");
+ if (Rf_inherits(yi, "factor")) return no_because("cannot compare factor and integer");
+
+ }
+ }
+ }
+
+ return yes();
+
+}
+
+// [[Rcpp::export]]
+dplyr::BoolResult compatible_data_frame(DataFrame x, DataFrame y, bool ignore_col_order = true, bool convert = false) {
+ int n = x.size();
+
+ bool null_x = Rf_isNull(x.names()), null_y = Rf_isNull(y.names());
+ if (null_x && !null_y) {
+ return no_because("x does not have names, but y does");
+ } else if (null_y && !null_x) {
+ return no_because("y does not have names, but x does");
+ } else if (null_x && null_y) {
+ return compatible_data_frame_nonames(x, y, convert);
+ }
+
+ CharacterVector names_x = x.names();
+ CharacterVector names_y = y.names();
+
+ CharacterVector names_y_not_in_x = setdiff(names_y, names_x);
+ CharacterVector names_x_not_in_y = setdiff(names_x, names_y);
+
+ if (!ignore_col_order) {
+ if (names_y_not_in_x.size() == 0 && names_x_not_in_y.size() == 0) {
+ // so the names are the same, check if they are in the same order
+ for (int i = 0; i < n; i++) {
+ if (names_x[i] != names_y[i]) {
+ return no_because("Same column names, but different order");
+ }
+ }
+ }
+ }
+
+ CharacterVector why;
+ if (names_y_not_in_x.size()) {
+ std::stringstream ss;
+ ss << "Cols in y but not x: " << collapse_utf8(names_y_not_in_x, ", ", "`") << ". ";
+ why.push_back(String(ss.str(), CE_UTF8));
+ }
+
+ if (names_x_not_in_y.size()) {
+ std::stringstream ss;
+ ss << "Cols in x but not y: " << collapse_utf8(names_x_not_in_y, ", ", "`") << ". ";
+ why.push_back(String(ss.str(), CE_UTF8));
+ }
+
+ if (why.length() > 0) return no_because(why);
+
+ IntegerVector orders = r_match(names_x, names_y);
+
+ for (int i = 0; i < n; i++) {
+ SymbolString name = names_x[i];
+ SEXP xi = x[i], yi = y[orders[i] - 1];
+ boost::scoped_ptr<SubsetVectorVisitor> vx(subset_visitor(xi, name));
+ boost::scoped_ptr<SubsetVectorVisitor> vy(subset_visitor(yi, name));
+
+ std::stringstream ss;
+ bool compatible = convert ?
+ vx->is_compatible(vy.get(), ss, name) :
+ vx->is_same_type(vy.get(), ss, name);
+
+ if (!compatible) {
+ if (ss.str() == "") {
+ ss << "Incompatible type for column `"
+ << name.get_utf8_cstring()
+ << "`: x " << vx->get_r_type()
+ << ", y " << vy->get_r_type();
+ }
+
+ why.push_back(String(ss.str(), CE_UTF8));
+ }
+
+ }
+
+ if (why.length() > 0) return no_because(why);
+ return yes();
+}
+
+// [[Rcpp::export]]
+dplyr::BoolResult equal_data_frame(DataFrame x, DataFrame y, bool ignore_col_order = true, bool ignore_row_order = true, bool convert = false) {
+ BoolResult compat = compatible_data_frame(x, y, ignore_col_order, convert);
+ if (!compat) return compat;
+
+ typedef VisitorSetIndexMap<DataFrameJoinVisitors, std::vector<int> > Map;
+ DataFrameJoinVisitors visitors(x, y, x.names(), x.names(), true, true);
+ Map map(visitors);
+
+ // train the map in both x and y
+ int nrows_x = x.nrows();
+ int nrows_y = y.nrows();
+
+ if (nrows_x != nrows_y)
+ return no_because("Different number of rows");
+ if (x.size() == 0)
+ return yes();
+
+ for (int i = 0; i < nrows_x; i++) map[i].push_back(i);
+ for (int i = 0; i < nrows_y; i++) map[-i - 1].push_back(-i - 1);
+
+ RowTrack track_x("Rows in x but not y: ");
+ RowTrack track_y("Rows in y but not x: ");
+ RowTrack track_mismatch("Rows with difference occurences in x and y: ");
+
+ bool ok = true;
+ Map::const_iterator it = map.begin();
+
+ for (; it != map.end(); ++it) {
+ // retrieve the indices ( -ves for y, +ves for x )
+ const std::vector<int>& chunk = it->second;
+ int n = chunk.size();
+
+ int count_left = 0, count_right = 0;
+ for (int i = 0; i < n; i++) {
+ if (chunk[i] < 0)
+ count_right++;
+ else
+ count_left++;
+ }
+ if (count_right == 0) {
+ track_x.record(chunk[0]);
+ ok = false;
+ } else if (count_left == 0) {
+ track_y.record(chunk[0]);
+ ok = false;
+ } else if (count_left != count_right) {
+ track_mismatch.record(chunk[0]);
+ ok = false;
+ }
+
+ }
+
+ if (!ok) {
+ std::stringstream ss;
+ if (! track_x.empty()) ss << track_x.str() << ". ";
+ if (! track_y.empty()) ss << track_y.str() << ". ";
+ if (! track_mismatch.empty()) ss << track_mismatch.str();
+
+ return no_because(CharacterVector::create(String(ss.str(), CE_UTF8)));
+ }
+
+ if (ok && ignore_row_order) return yes();
+
+ if (!ignore_row_order) {
+ for (int i = 0; i < nrows_x; i++) {
+ if (!visitors.equal(i, -i - 1)) {
+ return no_because("Same row values, but different order");
+ }
+ }
+ }
+
+ return yes();
+}
+
+// [[Rcpp::export]]
+DataFrame union_data_frame(DataFrame x, DataFrame y) {
+ BoolResult compat = compatible_data_frame(x, y, true, true);
+ if (!compat) {
+ stop("not compatible: %s", compat.why_not());
+ }
+
+ typedef VisitorSetIndexSet<DataFrameJoinVisitors> Set;
+ DataFrameJoinVisitors visitors(x, y, x.names(), x.names(), true, true);
+ Set set(visitors);
+
+ train_insert(set, x.nrows());
+ train_insert_right(set, y.nrows());
+
+ return visitors.subset(set, get_class(x));
+}
+
+// [[Rcpp::export]]
+DataFrame intersect_data_frame(DataFrame x, DataFrame y) {
+ BoolResult compat = compatible_data_frame(x, y, true, true);
+ if (!compat) {
+ stop("not compatible: %s", compat.why_not());
+ }
+ typedef VisitorSetIndexSet<DataFrameJoinVisitors> Set;
+
+ DataFrameJoinVisitors visitors(x, y, x.names(), x.names(), true, true);
+ Set set(visitors);
+
+ train_insert(set, x.nrows());
+
+ std::vector<int> indices;
+ int n_y = y.nrows();
+ for (int i = 0; i < n_y; i++) {
+ Set::iterator it = set.find(-i - 1);
+ if (it != set.end()) {
+ indices.push_back(*it);
+ set.erase(it);
+ }
+ }
+
+ return visitors.subset(indices, get_class(x));
+}
+
+// [[Rcpp::export]]
+DataFrame setdiff_data_frame(DataFrame x, DataFrame y) {
+ BoolResult compat = compatible_data_frame(x, y, true, true);
+ if (!compat) {
+ stop("not compatible: %s", compat.why_not());
+ }
+
+ typedef VisitorSetIndexSet<DataFrameJoinVisitors> Set;
+ DataFrameJoinVisitors visitors(y, x, y.names(), y.names(), true, true);
+ Set set(visitors);
+
+ train_insert(set, y.nrows());
+
+ std::vector<int> indices;
+
+ int n_x = x.nrows();
+ for (int i = 0; i < n_x; i++) {
+ if (!set.count(-i - 1)) {
+ set.insert(-i - 1);
+ indices.push_back(-i - 1);
+ }
+ }
+
+ return visitors.subset(indices, get_class(x));
+}
diff --git a/src/slice.cpp b/src/slice.cpp
new file mode 100644
index 0000000..1342700
--- /dev/null
+++ b/src/slice.cpp
@@ -0,0 +1,195 @@
+#include "pch.h"
+#include <dplyr/main.h>
+
+#include <tools/Quosure.h>
+
+#include <dplyr/GroupedDataFrame.h>
+
+#include <dplyr/Result/GroupedCallProxy.h>
+#include <dplyr/Result/CallProxy.h>
+
+using namespace Rcpp;
+using namespace dplyr;
+
+inline SEXP check_filter_integer_result(SEXP tmp) {
+ if (TYPEOF(tmp) != INTSXP && TYPEOF(tmp) != REALSXP && TYPEOF(tmp) != LGLSXP) {
+ stop("slice condition does not evaluate to an integer or numeric vector. ");
+ }
+ return tmp;
+}
+
+class CountIndices {
+public:
+ CountIndices(int nr_, IntegerVector test_) : nr(nr_), test(test_), n_pos(0), n_neg(0) {
+
+ for (int j = 0; j < test.size(); j++) {
+ int i = test[j];
+ if (i > 0 && i <= nr) {
+ n_pos++;
+ } else if (i < 0 && i >= -nr) {
+ n_neg++;
+ }
+ }
+
+ if (n_neg > 0 && n_pos > 0) {
+ stop("Found %d positive indices and %d negative indices", n_pos, n_neg);
+ }
+
+ }
+
+ inline bool is_positive() const {
+ return n_pos > 0;
+ }
+ inline int get_n_positive() const {
+ return n_pos;
+ }
+ inline int get_n_negative() const {
+ return n_neg;
+ }
+
+private:
+ int nr;
+ IntegerVector test;
+ int n_pos;
+ int n_neg;
+};
+
+DataFrame slice_grouped(GroupedDataFrame gdf, const QuosureList& dots) {
+ typedef GroupedCallProxy<GroupedDataFrame, LazyGroupedSubsets> Proxy;
+
+ const DataFrame& data = gdf.data();
+ const NamedQuosure& quosure = dots[0];
+ Environment env = quosure.env();
+ SymbolVector names = data.names();
+
+ // we already checked that we have only one expression
+ Call call(quosure.expr());
+
+ std::vector<int> indx;
+ indx.reserve(1000);
+
+ IntegerVector g_test;
+ Proxy call_proxy(call, gdf, env);
+
+ int ngroups = gdf.ngroups();
+ GroupedDataFrame::group_iterator git = gdf.group_begin();
+ for (int i = 0; i < ngroups; i++, ++git) {
+ const SlicingIndex& indices = *git;
+ int nr = indices.size();
+ g_test = check_filter_integer_result(call_proxy.get(indices));
+ CountIndices counter(indices.size(), g_test);
+
+ if (counter.is_positive()) {
+ // positive indexing
+ int ntest = g_test.size();
+ for (int j = 0; j < ntest; j++) {
+ if (!(g_test[j] > nr || g_test[j] == NA_INTEGER)) {
+ indx.push_back(indices[g_test[j] - 1]);
+ }
+ }
+ } else if (counter.get_n_negative() != 0) {
+ // negative indexing
+ std::set<int> drop;
+ int n = g_test.size();
+ for (int j = 0; j < n; j++) {
+ if (g_test[j] != NA_INTEGER)
+ drop.insert(-g_test[j]);
+ }
+ int n_drop = drop.size();
+ std::set<int>::const_iterator drop_it = drop.begin();
+
+ int k = 0, j = 0;
+ while (drop_it != drop.end()) {
+ int next_drop = *drop_it - 1;
+ while (j < next_drop) {
+ indx.push_back(indices[j++]);
+ k++;
+ }
+ j++;
+ ++drop_it;
+ }
+ while (k < nr - n_drop) {
+ indx.push_back(indices[j++]);
+ k++;
+ }
+ }
+ }
+
+ DataFrame res = subset(data, indx, names, classes_grouped<GroupedDataFrame>());
+ set_vars(res, get_vars(data));
+ strip_index(res);
+
+ return GroupedDataFrame(res).data();
+}
+
+DataFrame slice_not_grouped(const DataFrame& df, const QuosureList& dots) {
+ CharacterVector names = df.names();
+
+ const NamedQuosure& quosure = dots[0];
+ Call call(quosure.expr());
+ CallProxy proxy(call, df, quosure.env());
+ int nr = df.nrows();
+
+ IntegerVector test = check_filter_integer_result(proxy.eval());
+
+ int n = test.size();
+
+ // count the positive and negatives
+ CountIndices counter(nr, test);
+
+ // just positives -> one based subset
+ if (counter.is_positive()) {
+ int n_pos = counter.get_n_positive();
+ std::vector<int> idx(n_pos);
+ int j = 0;
+ for (int i = 0; i < n_pos; i++) {
+ while (test[j] > nr || test[j] == NA_INTEGER) j++;
+ idx[i] = test[j++] - 1;
+ }
+
+ return subset(df, idx, df.names(), classes_not_grouped());
+ }
+
+ // special case where only NA
+ if (counter.get_n_negative() == 0) {
+ std::vector<int> indices;
+ return subset(df, indices, df.names(), classes_not_grouped());
+ }
+
+ // just negatives (out of range is dealt with early in CountIndices).
+ std::set<int> drop;
+ for (int i = 0; i < n; i++) {
+ if (test[i] != NA_INTEGER)
+ drop.insert(-test[i]);
+ }
+ std::vector<int> indices;
+ indices.reserve(nr);
+ std::set<int>::const_iterator drop_it = drop.begin();
+
+ int j = 0;
+ while (drop_it != drop.end()) {
+ int next_drop = *drop_it - 1;
+ for (; j < next_drop; ++j) {
+ indices.push_back(j);
+ }
+ j++;
+ ++drop_it;
+ }
+ for (; j < nr; ++j) {
+ indices.push_back(j);
+ }
+
+ return subset(df, indices, df.names(), classes_not_grouped());
+}
+
+// [[Rcpp::export]]
+SEXP slice_impl(DataFrame df, QuosureList dots) {
+ if (dots.size() == 0) return df;
+ if (dots.size() != 1)
+ stop("slice only accepts one expression");
+ if (is<GroupedDataFrame>(df)) {
+ return slice_grouped(GroupedDataFrame(df), dots);
+ } else {
+ return slice_not_grouped(df, dots);
+ }
+}
diff --git a/src/strings_addresses.cpp b/src/strings_addresses.cpp
deleted file mode 100644
index 79caef4..0000000
--- a/src/strings_addresses.cpp
+++ /dev/null
@@ -1,19 +0,0 @@
-#include <Rcpp.h>
-using namespace Rcpp ;
-
-// [[Rcpp::export]]
-CharacterVector strings_addresses(CharacterVector s){
- static char buffer[20] ;
- int n = s.size() ;
-
- CharacterVector res(n) ;
- for( int i=0; i<n; i++){
- SEXP x = s[i] ;
- snprintf( buffer, 20, "%p", reinterpret_cast<void*>(x) ) ;
- res[i] = buffer ;
- }
- res.names() = s ;
-
- return res ;
-}
-
diff --git a/src/summarise.cpp b/src/summarise.cpp
index 6a48ba9..c2abcc1 100644
--- a/src/summarise.cpp
+++ b/src/summarise.cpp
@@ -1,115 +1,169 @@
-#include <dplyr.h>
+#include "pch.h"
+#include <dplyr/main.h>
-using namespace Rcpp ;
-using namespace dplyr ;
+#include <boost/scoped_ptr.hpp>
-template <typename Data, typename Subsets>
-SEXP summarise_grouped(const DataFrame& df, const LazyDots& dots){
- Data gdf(df) ;
-
- int nexpr = dots.size() ;
- int nvars = gdf.nvars() ;
- check_not_groups(dots, gdf);
- NamedListAccumulator<Data> accumulator ;
-
- int i=0;
- List results(nvars + nexpr) ;
- for( ; i<nvars; i++){
- results[i] = shared_SEXP(gdf.label(i)) ;
- accumulator.set( PRINTNAME(gdf.symbol(i)), results[i] ) ;
- }
+#include <tools/Quosure.h>
- Subsets subsets(gdf) ;
- for( int k=0; k<nexpr; k++, i++ ){
- Rcpp::checkUserInterrupt() ;
- const Lazy& lazy = dots[k] ;
- const Environment& env = lazy.env() ;
-
- Shield<SEXP> expr_(lazy.expr()) ; SEXP expr = expr_ ;
- boost::scoped_ptr<Result> res( get_handler( expr, subsets, env ) );
-
- // if we could not find a direct Result
- // we can use a GroupedCallReducer which will callback to R
- if( !res ) {
- res.reset( new GroupedCallReducer<Data, Subsets>( lazy.expr(), subsets, env) );
- }
- RObject result = res->process(gdf) ;
- results[i] = result ;
- accumulator.set( lazy.name(), result );
- subsets.input( lazy.name(), SummarisedVariable(result) ) ;
+#include <dplyr/GroupedDataFrame.h>
- }
+#include <dplyr/Result/LazyRowwiseSubsets.h>
+#include <dplyr/Result/GroupedCallReducer.h>
+#include <dplyr/Result/CallProxy.h>
+
+#include <dplyr/Gatherer.h>
+#include <dplyr/NamedListAccumulator.h>
+#include <dplyr/Groups.h>
+
+using namespace Rcpp;
+using namespace dplyr;
- List out = accumulator ;
- copy_most_attributes( out, df) ;
- out.names() = accumulator.names() ;
+static
+SEXP validate_unquoted_value(SEXP value, int nrows, const SymbolString& name) {
+ int n = Rf_length(value);
+ check_length(n, nrows, "the number of groups", name);
- int nr = gdf.ngroups() ;
- set_rownames(out, nr ) ;
+ // Recycle length 1 vectors
+ if (n == 1) {
+ boost::scoped_ptr<Gatherer> gather(constant_gatherer(value, nrows, name));
+ value = gather->collect();
+ }
- if( gdf.nvars() > 1){
- out.attr( "class" ) = classes_grouped<Data>() ;
- List vars = gdf.data().attr("vars") ;
- vars.erase( gdf.nvars() - 1) ;
- out.attr( "vars") = vars ;
- out.attr( "labels") = R_NilValue ;
- out.attr( "indices") = R_NilValue ;
- out.attr( "group_sizes") = R_NilValue ;
- out.attr( "biggest_group_size") = R_NilValue ;
+ return value;
+}
- out.attr( "drop" ) = true ;
+template <typename Data, typename Subsets>
+DataFrame summarise_grouped(const DataFrame& df, const QuosureList& dots) {
+ Data gdf(df);
+
+ int nexpr = dots.size();
+ int nvars = gdf.nvars();
+ check_not_groups(dots, gdf);
+
+ LOG_VERBOSE << "copying " << nvars << " variables to accumulator";
+
+ NamedListAccumulator<Data> accumulator;
+ int i = 0;
+ List results(nvars + nexpr);
+ for (; i < nvars; i++) {
+ LOG_VERBOSE << "copying " << gdf.symbol(i).get_utf8_cstring();
+ results[i] = shared_SEXP(gdf.label(i));
+ accumulator.set(gdf.symbol(i), results[i]);
+ }
+
+ LOG_VERBOSE << "processing " << nexpr << " variables";
+
+ Subsets subsets(gdf);
+ for (int k = 0; k < nexpr; k++, i++) {
+ LOG_VERBOSE << "processing variable " << k;
+ Rcpp::checkUserInterrupt();
+ const NamedQuosure& quosure = dots[k];
+ const Environment& env = quosure.env();
+
+ LOG_VERBOSE << "processing variable " << quosure.name().get_utf8_cstring();
+
+ Shield<SEXP> expr_(quosure.expr());
+ SEXP expr = expr_;
+ RObject result;
+
+    // Unquoted vectors are directly used as a column. Expressions are
+ // evaluated in each group.
+ if (is_vector(expr)) {
+ result = validate_unquoted_value(expr, gdf.ngroups(), quosure.name());
} else {
- out.attr( "class" ) = classes_not_grouped() ;
- SET_ATTRIB( out, strip_group_attributes(out) ) ;
+ boost::scoped_ptr<Result> res(get_handler(expr, subsets, env));
+
+ // If we could not find a direct Result,
+ // we can use a GroupedCallReducer which will callback to R.
+ // Note that the GroupedCallReducer currently doesn't apply
+ // special treatment to summary variables, for which hybrid
+ // evaluation should be turned off completely (#2312)
+ if (!res) {
+ res.reset(new GroupedCallReducer<Data, Subsets>(quosure.expr(), subsets, env, quosure.name()));
+ }
+ result = res->process(gdf);
}
- return out ;
+ results[i] = result;
+ accumulator.set(quosure.name(), result);
+ subsets.input_summarised(quosure.name(), SummarisedVariable(result));
+ }
+
+ List out = accumulator;
+ copy_most_attributes(out, df);
+ out.names() = accumulator.names();
+
+ int nr = gdf.ngroups();
+ set_rownames(out, nr);
+
+ if (gdf.nvars() > 1) {
+ set_class(out, classes_grouped<Data>());
+ SymbolVector vars = get_vars(gdf.data(), true);
+ vars.remove(gdf.nvars() - 1);
+ set_vars(out, vars);
+ out.attr("drop") = true;
+
+ strip_index(out);
+ } else {
+ set_class(out, classes_not_grouped());
+ SET_ATTRIB(out, strip_group_attributes(out));
+ }
+
+ return out;
}
-SEXP summarise_not_grouped(DataFrame df, const LazyDots& dots){
- int nexpr = dots.size() ;
- if( nexpr == 0) return DataFrame() ;
-
- LazySubsets subsets( df ) ;
- NamedListAccumulator<DataFrame> accumulator ;
- List results(nexpr) ;
-
- for( int i=0; i<nexpr; i++){
- Rcpp::checkUserInterrupt() ;
-
- const Lazy& lazy = dots[i] ;
- Environment env = lazy.env() ;
- Shield<SEXP> expr_(lazy.expr()) ; SEXP expr = expr_ ;
- boost::scoped_ptr<Result> res( get_handler( expr, subsets, env ) ) ;
- SEXP result ;
- if(res) {
- result = results[i] = res->process( FullDataFrame(df) ) ;
- } else {
- result = results[i] = CallProxy( lazy.expr(), subsets, env).eval() ;
- }
- if( Rf_length(result) != 1 ){
- stop( "expecting result of length one, got : %d", Rf_length(result) ) ;
- }
- accumulator.set(lazy.name(), result );
- subsets.input( lazy.name(), result ) ;
+DataFrame summarise_not_grouped(DataFrame df, const QuosureList& dots) {
+ int nexpr = dots.size();
+ if (nexpr == 0) return DataFrame();
+
+ LazySubsets subsets(df);
+ NamedListAccumulator<DataFrame> accumulator;
+ List results(nexpr);
+
+ for (int i = 0; i < nexpr; i++) {
+ Rcpp::checkUserInterrupt();
+
+ const NamedQuosure& quosure = dots[i];
+ Environment env = quosure.env();
+ Shield<SEXP> expr_(quosure.expr());
+ SEXP expr = expr_;
+ SEXP result;
+
+    // Unquoted vectors are directly used as a column. Expressions are
+ // evaluated in each group.
+ if (is_vector(expr)) {
+ result = validate_unquoted_value(expr, 1, quosure.name());
+ } else {
+ boost::scoped_ptr<Result> res(get_handler(expr, subsets, env));
+ if (res) {
+ result = results[i] = res->process(FullDataFrame(df));
+ } else {
+ result = results[i] = CallProxy(quosure.expr(), subsets, env).eval();
+ }
+ check_supported_type(result, quosure.name());
+ check_length(Rf_length(result), 1, "a summary value", quosure.name());
}
- List data = accumulator ;
- copy_most_attributes(data, df) ;
- data.names() = accumulator.names() ;
- set_rownames(data, 1 ) ;
- return data ;
+ accumulator.set(quosure.name(), result);
+ subsets.input_summarised(quosure.name(), SummarisedVariable(result));
+ }
+
+ List data = accumulator;
+ copy_most_attributes(data, df);
+ data.names() = accumulator.names();
+ set_rownames(data, 1);
+ return data;
}
// [[Rcpp::export]]
-SEXP summarise_impl( DataFrame df, LazyDots dots){
- if( df.size() == 0 ) return df ;
- check_valid_colnames(df) ;
- if( is<RowwiseDataFrame>(df) ){
- return summarise_grouped<RowwiseDataFrame, LazyRowwiseSubsets>( df, dots);
- } else if( is<GroupedDataFrame>( df ) ){
- return summarise_grouped<GroupedDataFrame, LazyGroupedSubsets>( df, dots);
- } else {
- return summarise_not_grouped( df, dots) ;
- }
+SEXP summarise_impl(DataFrame df, QuosureList dots) {
+ if (df.size() == 0) return df;
+ check_valid_colnames(df);
+ if (is<RowwiseDataFrame>(df)) {
+ return summarise_grouped<RowwiseDataFrame, LazyRowwiseSubsets>(df, dots);
+ } else if (is<GroupedDataFrame>(df)) {
+ return summarise_grouped<GroupedDataFrame, LazyGroupedSubsets>(df, dots);
+ } else {
+ return summarise_not_grouped(df, dots);
+ }
}
diff --git a/src/test.cpp b/src/test.cpp
index de7ad80..4835849 100644
--- a/src/test.cpp
+++ b/src/test.cpp
@@ -1,20 +1,128 @@
-#include <dplyr.h>
+#include "pch.h"
+#include <dplyr/main.h>
-using namespace Rcpp ;
-using namespace dplyr ;
+#include <dplyr/comparisons.h>
+#include <dplyr/join_match.h>
+
+using namespace Rcpp;
+using namespace dplyr;
// [[Rcpp::export]]
-LogicalVector test_comparisons(){
- dplyr::comparisons<REALSXP> comp ;
- return LogicalVector::create(
- comp.is_less( 1.0, 2.0 ),
- !comp.is_less( 2.0, 1.0 ),
- comp.is_less( NA_REAL, R_NaN ),
- ! comp.is_less( R_NaN, NA_REAL),
- ! comp.is_less( NA_REAL, 1.0 ),
- ! comp.is_less( R_NaN, 1.0 ),
- comp.is_less( 1.0, NA_REAL ),
- comp.is_less( 1.0, R_NaN )
- ) ;
+LogicalVector test_comparisons() {
+ typedef dplyr::comparisons<REALSXP> comp;
+ return LogicalVector::create(comp::is_less(1.0, 2.0),
+ !comp::is_less(2.0, 1.0),
+ comp::is_less(NA_REAL, R_NaN),
+ !comp::is_less(R_NaN, NA_REAL),
+ !comp::is_less(NA_REAL, 1.0),
+ !comp::is_less(R_NaN, 1.0),
+ comp::is_less(1.0, NA_REAL),
+ comp::is_less(1.0, R_NaN)
+ );
}
+// [[Rcpp::export]]
+List test_matches() {
+ typedef dplyr::join_match<INTSXP, INTSXP, true> int_int_na;
+ typedef dplyr::join_match<REALSXP, REALSXP, true> real_real_na;
+ typedef dplyr::join_match<INTSXP, REALSXP, true> int_real_na;
+ typedef dplyr::join_match<REALSXP, INTSXP, true> real_int_na;
+ typedef dplyr::join_match<INTSXP, INTSXP, false> int_int;
+ typedef dplyr::join_match<REALSXP, REALSXP, false> real_real;
+ typedef dplyr::join_match<INTSXP, REALSXP, false> int_real;
+ typedef dplyr::join_match<REALSXP, INTSXP, false> real_int;
+ return
+ List::create(
+ LogicalVector::create(
+ int_int_na::is_match(1, 1),
+ !int_int_na::is_match(1, 2),
+ !int_int_na::is_match(1, NA_INTEGER),
+ !int_int_na::is_match(NA_INTEGER, 1),
+ int_int_na::is_match(NA_INTEGER, NA_INTEGER),
+ int_int::is_match(1, 1),
+ !int_int::is_match(1, 2),
+ !int_int::is_match(1, NA_INTEGER),
+ !int_int::is_match(NA_INTEGER, 1),
+ !int_int::is_match(NA_INTEGER, NA_INTEGER)
+ ),
+
+ LogicalVector::create(
+ real_real_na::is_match(1, 1),
+ !real_real_na::is_match(1, 2),
+ !real_real_na::is_match(1, NA_REAL),
+ !real_real_na::is_match(NA_REAL, 1),
+ !real_real_na::is_match(1, R_NaN),
+ !real_real_na::is_match(R_NaN, 1),
+ !real_real_na::is_match(R_NaN, NA_REAL),
+ !real_real_na::is_match(NA_REAL, R_NaN),
+ real_real_na::is_match(NA_REAL, NA_REAL),
+ real_real_na::is_match(R_NaN, R_NaN),
+ real_real::is_match(1, 1),
+ !real_real::is_match(1, 2),
+ !real_real::is_match(1, NA_REAL),
+ !real_real::is_match(NA_REAL, 1),
+ !real_real::is_match(1, R_NaN),
+ !real_real::is_match(R_NaN, 1),
+ !real_real::is_match(R_NaN, NA_REAL),
+ !real_real::is_match(NA_REAL, R_NaN),
+ !real_real::is_match(NA_REAL, NA_REAL),
+ !real_real::is_match(R_NaN, R_NaN)
+ ),
+
+ LogicalVector::create(
+ int_real_na::is_match(1, 1),
+ !int_real_na::is_match(1, 2),
+ !int_real_na::is_match(1, NA_REAL),
+ !int_real_na::is_match(NA_INTEGER, 1),
+ !int_real_na::is_match(1, R_NaN),
+ !int_real_na::is_match(NA_INTEGER, R_NaN),
+ int_real_na::is_match(NA_INTEGER, NA_REAL),
+ int_real::is_match(1, 1),
+ !int_real::is_match(1, 2),
+ !int_real::is_match(1, NA_REAL),
+ !int_real::is_match(NA_INTEGER, 1),
+ !int_real::is_match(1, R_NaN),
+ !int_real::is_match(NA_INTEGER, R_NaN),
+ !int_real::is_match(NA_INTEGER, NA_REAL)
+ ),
+
+ LogicalVector::create(
+ real_int_na::is_match(1, 1),
+ !real_int_na::is_match(1, 2),
+ !real_int_na::is_match(1, NA_INTEGER),
+ !real_int_na::is_match(NA_REAL, 1),
+ !real_int_na::is_match(R_NaN, 1),
+ !real_int_na::is_match(R_NaN, NA_INTEGER),
+ real_int_na::is_match(NA_REAL, NA_INTEGER),
+ real_int::is_match(1, 1),
+ !real_int::is_match(1, 2),
+ !real_int::is_match(1, NA_INTEGER),
+ !real_int::is_match(NA_REAL, 1),
+ !real_int::is_match(R_NaN, 1),
+ !real_int::is_match(R_NaN, NA_INTEGER),
+ !real_int::is_match(NA_REAL, NA_INTEGER)
+ )
+ );
+}
+
+// [[Rcpp::export]]
+LogicalVector test_length_wrap() {
+ R_xlen_t small = R_LEN_T_MAX / 2;
+
+#ifdef LONG_VECTOR_SUPPORT
+ R_xlen_t large = (R_xlen_t)(R_LEN_T_MAX * 2.0);
+ R_xlen_t missing = NA_INTEGER;
+
+ return
+ LogicalVector::create(
+ as<double>(wrap(small)) == (double)small,
+ as<double>(wrap(large)) == (double)large,
+ as<double>(wrap(missing)) == (double)missing
+ );
+#else
+ return
+ LogicalVector::create(
+ as<double>(wrap(small)) == (double)small
+ );
+#endif
+}
diff --git a/src/utils.cpp b/src/utils.cpp
new file mode 100644
index 0000000..e1780cb
--- /dev/null
+++ b/src/utils.cpp
@@ -0,0 +1,290 @@
+#include "pch.h"
+#include <dplyr/main.h>
+
+#include <tools/utils.h>
+#include <dplyr/white_list.h>
+#include <tools/collapse.h>
+#include <dplyr/bad.h>
+
+using namespace Rcpp;
+
+// [[Rcpp::export]]
+void assert_all_white_list(const DataFrame& data) {
+ // checking variables are on the white list
+ int nc = data.size();
+ for (int i = 0; i < nc; i++) {
+ if (!white_list(data[i])) {
+ SymbolVector names = data.names();
+ const SymbolString& name_i = names[i];
+ SEXP v = data[i];
+
+ SEXP klass = Rf_getAttrib(v, R_ClassSymbol);
+ if (!Rf_isNull(klass)) {
+ bad_col(name_i, "is of unsupported class {type}",
+ _["type"] = get_single_class(v));
+ }
+ else {
+ bad_col(name_i, "is of unsupported type {type}", _["type"] = Rf_type2char(TYPEOF(v)));
+ }
+ }
+ }
+}
+
+SEXP shared_SEXP(SEXP x) {
+ SET_NAMED(x, 2);
+ return x;
+}
+
+// [[Rcpp::export]]
+SEXP shallow_copy(const List& data) {
+ int n = data.size();
+ List out(n);
+ for (int i = 0; i < n; i++) {
+ out[i] = shared_SEXP(data[i]);
+ }
+ copy_attributes(out, data);
+ return out;
+}
+
+SEXP pairlist_shallow_copy(SEXP p) {
+ Shield<SEXP> attr(Rf_cons(CAR(p), R_NilValue));
+ SEXP q = attr;
+ SET_TAG(q, TAG(p));
+ p = CDR(p);
+ while (!Rf_isNull(p)) {
+ Shield<SEXP> s(Rf_cons(CAR(p), R_NilValue));
+ SETCDR(q, s);
+ q = CDR(q);
+ SET_TAG(q, TAG(p));
+ p = CDR(p);
+ }
+ return attr;
+}
+
+void copy_only_attributes(SEXP out, SEXP data) {
+ SEXP att = ATTRIB(data);
+ const bool has_attributes = !Rf_isNull(att);
+ if (has_attributes) {
+ LOG_VERBOSE << "copying attributes: " << CharacterVector(List(att).names());
+
+ SET_ATTRIB(out, pairlist_shallow_copy(ATTRIB(data)));
+ }
+}
+
+void copy_attributes(SEXP out, SEXP data) {
+ copy_only_attributes(out, data);
+ SET_OBJECT(out, OBJECT(data));
+ if (IS_S4_OBJECT(data)) SET_S4_OBJECT(out);
+}
+
+SEXP null_if_empty(SEXP x) {
+ if (Rf_length(x))
+ return x;
+ else
+ return R_NilValue;
+}
+
+
+namespace dplyr {
+
+std::string get_single_class(SEXP x) {
+ SEXP klass = Rf_getAttrib(x, R_ClassSymbol);
+ if (!Rf_isNull(klass)) {
+ CharacterVector classes(klass);
+ return collapse_utf8(classes, "/");
+ }
+
+ if (Rf_isMatrix(x)) {
+ return "matrix";
+ }
+
+ switch (TYPEOF(x)) {
+ case INTSXP:
+ return "integer";
+ case REALSXP :
+ return "numeric";
+ case LGLSXP:
+ return "logical";
+ case STRSXP:
+ return "character";
+
+ case VECSXP:
+ return "list";
+ default:
+ break;
+ }
+
+ // just call R to deal with other cases
+  // we could call R_data_class directly but we might get a "this is not part of the api" warning
+ klass = Rf_eval(Rf_lang2(Rf_install("class"), x), R_GlobalEnv);
+ return CHAR(STRING_ELT(klass, 0));
+}
+
+CharacterVector default_chars(SEXP x, R_xlen_t len) {
+ if (Rf_isNull(x)) return CharacterVector(len);
+ return x;
+}
+
+CharacterVector get_class(SEXP x) {
+ SEXP class_attr = Rf_getAttrib(x, R_ClassSymbol);
+ return default_chars(class_attr, 0);
+}
+
+SEXP set_class(SEXP x, const CharacterVector& class_) {
+ SEXP class_attr = class_.length() == 0 ? R_NilValue : (SEXP)class_;
+ return Rf_setAttrib(x, R_ClassSymbol, class_attr);
+}
+
+CharacterVector get_levels(SEXP x) {
+ SEXP levels_attr = Rf_getAttrib(x, R_LevelsSymbol);
+ return default_chars(levels_attr, 0);
+}
+
+SEXP set_levels(SEXP x, const CharacterVector& levels) {
+ return Rf_setAttrib(x, R_LevelsSymbol, levels);
+}
+
+bool same_levels(SEXP left, SEXP right) {
+ return character_vector_equal(get_levels(left), get_levels(right));
+}
+
+SEXP list_as_chr(SEXP x) {
+ int n = Rf_length(x);
+ CharacterVector chr(n);
+
+ for (int i = 0; i != n; ++i) {
+ SEXP elt = VECTOR_ELT(x, i);
+ switch (TYPEOF(elt)) {
+ case STRSXP:
+ if (Rf_length(chr) == 1) {
+ chr[i] = elt;
+ continue;
+ }
+ break;
+ case SYMSXP:
+ chr[i] = PRINTNAME(elt);
+ continue;
+ default:
+ break;
+ }
+
+ stop("The tibble's `vars` attribute has unexpected contents");
+ }
+
+ return chr;
+}
+
+SymbolVector get_vars(SEXP x, bool duplicate) {
+ static SEXP vars_symbol = Rf_install("vars");
+ RObject vars = Rf_getAttrib(x, vars_symbol);
+ if (duplicate && MAYBE_SHARED(vars)) vars = Rf_duplicate(vars);
+
+ switch (TYPEOF(vars)) {
+ case NILSXP:
+ case STRSXP:
+ break;
+ case VECSXP:
+ vars = list_as_chr(vars);
+ break;
+ default:
+ stop("The tibble's `vars` attribute has unexpected type");
+ }
+
+ return SymbolVector(vars);
+}
+
+void set_vars(SEXP x, const SymbolVector& vars) {
+ static SEXP vars_symbol = Rf_install("vars");
+ Rf_setAttrib(x, vars_symbol, null_if_empty(vars.get_vector()));
+}
+
+void copy_vars(SEXP target, SEXP source) {
+ set_vars(target, get_vars(source));
+}
+
+bool character_vector_equal(const CharacterVector& x, const CharacterVector& y) {
+ if ((SEXP)x == (SEXP)y) return true;
+
+ if (x.length() != y.length())
+ return false;
+
+ for (R_xlen_t i = 0; i < x.length(); ++i) {
+ SEXP xi = x[i];
+ SEXP yi = y[i];
+
+ // Ideally we'd use Rf_Seql(), but this is not exported.
+ if (Rf_NonNullStringMatch(xi, yi)) continue;
+ if (xi == NA_STRING && yi == NA_STRING) continue;
+ if (xi == NA_STRING || yi == NA_STRING)
+ return false;
+ if (CHAR(xi)[0] == 0 && CHAR(yi)[0] == 0) continue;
+ return false;
+ }
+
+ return true;
+}
+
+}
+
+bool is_vector(SEXP x) {
+ switch (TYPEOF(x)) {
+ case LGLSXP:
+ case INTSXP:
+ case REALSXP:
+ case CPLXSXP:
+ case STRSXP:
+ case RAWSXP:
+ case VECSXP:
+ return true;
+ default:
+ return false;
+ }
+}
+bool is_atomic(SEXP x) {
+ switch (TYPEOF(x)) {
+ case LGLSXP:
+ case INTSXP:
+ case REALSXP:
+ case CPLXSXP:
+ case STRSXP:
+ case RAWSXP:
+ return true;
+ default:
+ return false;
+ }
+}
+
+SEXP vec_names(SEXP x) {
+ return Rf_getAttrib(x, R_NamesSymbol);
+}
+bool is_str_empty(SEXP str) {
+ const char* c_str = CHAR(str);
+ return strcmp(c_str, "") == 0;
+}
+bool has_name_at(SEXP x, R_len_t i) {
+ SEXP nms = vec_names(x);
+ return TYPEOF(nms) == STRSXP && !is_str_empty(STRING_ELT(nms, i));
+}
+SEXP name_at(SEXP x, size_t i) {
+ SEXP names = vec_names(x);
+ if (Rf_isNull(names))
+ return Rf_mkChar("");
+ else
+ return STRING_ELT(names, i);
+}
+
+SEXP f_env(SEXP x) {
+ return Rf_getAttrib(x, Rf_install(".Environment"));
+}
+bool is_quosure(SEXP x) {
+ return TYPEOF(x) == LANGSXP
+ && Rf_length(x) == 2
+ && Rf_inherits(x, "quosure")
+ && TYPEOF(f_env(x)) == ENVSXP;
+}
+SEXP maybe_rhs(SEXP x) {
+ if (is_quosure(x))
+ return CADR(x);
+ else
+ return x;
+}
diff --git a/src/window.cpp b/src/window.cpp
index 37750c4..d437b34 100644
--- a/src/window.cpp
+++ b/src/window.cpp
@@ -1,14 +1,14 @@
-#include <Rcpp.h>
-using namespace Rcpp;
+#include "pch.h"
+#include <dplyr/main.h>
//' Cumulativate versions of any, all, and mean
//'
-//' dplyr adds \code{cumall}, \code{cumany}, and \code{cummean} to complete
+//' dplyr adds `cumall()`, `cumany()`, and `cummean()` to complete
//' R's set of cumulate functions to match the aggregation functions available
//' in most databases
//'
-//' @param x For \code{cumall} & \code{cumany}, a logical vector; for
-//' \code{cummean} an integer or numeric vector
+//' @param x For `cumall()` and `cumany()`, a logical vector; for
+//' `cummean()` an integer or numeric vector
//' @export
// [[Rcpp::export]]
LogicalVector cumall(LogicalVector x) {
@@ -16,17 +16,17 @@ LogicalVector cumall(LogicalVector x) {
LogicalVector out(n, NA_LOGICAL);
int current = out[0] = x[0];
- if( current == NA_LOGICAL) return out ;
- if( current == FALSE){
- std::fill( out.begin(), out.end(), FALSE ) ;
- return out ;
+ if (current == NA_LOGICAL) return out;
+ if (current == FALSE) {
+ std::fill(out.begin(), out.end(), FALSE);
+ return out;
}
for (int i = 1; i < n; i++) {
- current = x[i] ;
- if( current == NA_LOGICAL ) break ;
- if( current == FALSE ){
- std::fill( out.begin() + i, out.end(), FALSE ) ;
- break ;
+ current = x[i];
+ if (current == NA_LOGICAL) break;
+ if (current == FALSE) {
+ std::fill(out.begin() + i, out.end(), FALSE);
+ break;
}
out[i] = current && out[i - 1];
}
@@ -41,17 +41,17 @@ LogicalVector cumany(LogicalVector x) {
LogicalVector out(n, NA_LOGICAL);
int current = out[0] = x[0];
- if( current == NA_LOGICAL ) return out ;
- if( current == TRUE ){
- std::fill( out.begin(), out.end(), TRUE ) ;
- return out ;
+ if (current == NA_LOGICAL) return out;
+ if (current == TRUE) {
+ std::fill(out.begin(), out.end(), TRUE);
+ return out;
}
for (int i = 1; i < n; i++) {
- current = x[i] ;
- if( current == NA_LOGICAL ) break ;
- if( current == TRUE ){
- std::fill( out.begin() + i, out.end(), TRUE ) ;
- break ;
+ current = x[i];
+ if (current == NA_LOGICAL) break;
+ if (current == TRUE) {
+ std::fill(out.begin() + i, out.end(), TRUE);
+ break;
}
out[i] = current || out[i - 1];
}
@@ -67,7 +67,7 @@ NumericVector cummean(NumericVector x) {
NumericVector out = no_init(n);
double sum = out[0] = x[0];
- for (int i = 1; i < n; i++ ) {
+ for (int i = 1; i < n; i++) {
sum += x[i];
out[i] = sum / (i + 1.0);
}
diff --git a/tests/testthat/helper-astyle.R b/tests/testthat/helper-astyle.R
new file mode 100644
index 0000000..57cd05f
--- /dev/null
+++ b/tests/testthat/helper-astyle.R
@@ -0,0 +1,30 @@
+vcapply <- function(X, FUN, ..., USE.NAMES = TRUE) {
+ vapply(X = X, FUN = FUN, FUN.VALUE = character(1L), ..., USE.NAMES = USE.NAMES)
+}
+
+astyle <- function(extra_args = character()) {
+ astyle_cmd <- "astyle"
+ if (Sys.which(astyle_cmd) == "") {
+ skip("astyle not found")
+ }
+
+ astyle_args <- c(
+ "-n",
+ "--indent=spaces=2",
+ "--unpad-paren",
+ "--pad-header",
+ "--pad-oper",
+ "--min-conditional-indent=0",
+ "--align-pointer=type",
+ "--align-reference=type"
+ )
+
+ src_path <- normalizePath(map_chr(c("../../src", "../../inst/include"), testthat::test_path))
+ src_files <- dir(src_path, "[.](?:cpp|h)$", recursive = TRUE, full.names = TRUE)
+ astyle_files <- grep("(?:RcppExports[.](?:cpp|h)|static_assert[.]h)", src_files, value = TRUE, invert = TRUE)
+ output <- system2(astyle_cmd, c(astyle_args, astyle_files, extra_args), stdout = TRUE, stderr = TRUE)
+ unchanged <- grepl("^Unchanged", output)
+ if (any(!unchanged)) {
+ rlang::warn(paste(output[!unchanged], collapse = "\n"))
+ }
+}
diff --git a/tests/testthat/helper-combine.R b/tests/testthat/helper-combine.R
new file mode 100644
index 0000000..ebcba96
--- /dev/null
+++ b/tests/testthat/helper-combine.R
@@ -0,0 +1,270 @@
+combine_pair_test <- function(item_pair, var1, var2, result,
+ can_combine = TRUE, warning = FALSE) {
+ label_if_fail <- paste0(
+ "combine(items[c(\"", var1, "\", \"", var2, "\")])"
+ )
+
+ if (warning) {
+ warning_regexp <- ".*"
+ } else {
+ warning_regexp <- NA
+ }
+
+ if (can_combine) {
+ expect_warning(
+ res <- combine(item_pair),
+ regexp = warning_regexp,
+ label = label_if_fail
+ )
+ expect_equal(
+ object = res,
+ expected = result,
+ label = label_if_fail,
+ expected.label = deparse(result)
+ )
+ } else {
+ expect_warning(
+ expect_error(
+ combine(item_pair),
+ "^Argument 2 can't be converted from [^ ]* to [^ ]*$",
+ label = label_if_fail
+ ),
+ regexp = warning_regexp,
+ label = label_if_fail
+ )
+ }
+}
+
+can_be_combined <- function(item1, item2,
+ class1, class2,
+ all_na1, all_na2,
+ known_to_dplyr1, known_to_dplyr2) {
+
+ # Unknown classes will be stripped and ignored (#2406)
+ if (!known_to_dplyr1) {
+ class1 <- class(as.vector(item1))
+ }
+ if (!known_to_dplyr2) {
+ class2 <- class(as.vector(item2))
+ }
+
+ # Two elements of the same class can be combined
+ # NA values are also combinable
+ if (identical(class1, class2) || all_na1 || all_na2) {
+ return(TRUE)
+ }
+
+ # doubles and integers:
+ if (all(c(class1, class2) %in% c("numeric", "integer"))) {
+ return(TRUE)
+ }
+
+ # coerce factor with character
+ if ((class1 == "factor" && class2 == "character") ||
+ (class2 == "factor" && class1 == "character")) {
+ return(TRUE)
+ }
+
+ # All the other cases can't be combined
+ return(FALSE)
+}
+
+give_a_warning <- function(item1, item2,
+ class1, class2,
+ known_to_dplyr1, known_to_dplyr2,
+ can_be_combined) {
+ # Unknown classes give a warning, because attributes may be wrong
+ if (!known_to_dplyr1) {
+ return(TRUE)
+ }
+
+ # If only the second element is of an unknown type to dplyr
+  # Then the warning is only emitted in case we can combine (otherwise the
+ # error appears before)
+ if (!known_to_dplyr2 && can_be_combined) {
+ return(TRUE)
+ }
+
+ # factor and character give a warning when combined (coercion to character)
+ if ((class1 == "factor" && class2 == "character") ||
+ (class1 == "character" && class2 == "factor")) {
+ return(TRUE)
+ }
+
+ # Two factors give a warning if they don't have identical levels (coercion to character)
+ if (class1 == "factor" && class2 == "factor") {
+ if (!identical(levels(item1), levels(item2))) {
+ return(TRUE)
+ }
+ }
+ # All other cases do not raise a warning
+ return(FALSE)
+}
+
+combine_result <- function(item1, item2,
+ class1, class2,
+ all_na1, all_na2,
+ known_to_dplyr1, known_to_dplyr2,
+ can_combine, give_warning) {
+ result <- NULL
+
+ # Unknown classes will be stripped and ignored (#2406)
+ if (!known_to_dplyr1) {
+ class1 <- class(as.vector(item1))
+ }
+ if (!known_to_dplyr2) {
+ class2 <- class(as.vector(item2))
+ }
+
+ if (can_combine) {
+ # Custom coercions:
+ # - Factor with character coerced to character
+ # - Factor with Factor without same levels -> character
+ # - Factor with NA is Factor
+ # Otherwise use the default approach with unlist and add classes
+ # if needed.
+ if ((class1 == "factor" && class2 == "character") ||
+ (class2 == "factor" && class1 == "character")) {
+ result <- c(as.character(item1), as.character(item2))
+ } else if ((class1 == "factor" && class2 == "factor") &&
+ !identical(levels(item1), levels(item2))) {
+ result <- c(as.character(item1), as.character(item2))
+ } else if ((is.factor(item1) && all(is.na(item2))) ||
+ (is.factor(item2) && all(is.na(item1)))) {
+ result <- factor(c(as.character(item1), as.character(item2)))
+ } else {
+ # Default combination result
+ result <- unlist(
+ list(item1, item2),
+ recursive = FALSE,
+ use.names = FALSE
+ )
+
+ # Add classes and attributes in some cases to the default
+ if ((all(is.na(item1)) && "POSIXct" %in% class2) ||
+ (all(is.na(item2)) && "POSIXct" %in% class1) ||
+ ("POSIXct" %in% class1 && "POSIXct" %in% class2)) {
+ class(result) <- c("POSIXct", "POSIXt")
+ attr(result, "tzone") <- ""
+ } else if (all_na1 && known_to_dplyr2) {
+ class(result) <- class2
+ } else if (all_na2 && known_to_dplyr1) {
+ class(result) <- class1
+ } else if (identical(class1, class2) && known_to_dplyr1) {
+ class(result) <- class1
+ }
+ }
+ }
+ list(result)
+}
+
+
+
+prepare_table_with_coercion_rules <- function() {
+ items <- list(
+ logicalvalue = TRUE,
+ logicalNA = NA,
+ anotherNA = c(NA, NA),
+ integer = 4L,
+ factor = factor("a"),
+ another_factor = factor("b"),
+ double = 4.5,
+ character = "c",
+ POSIXct = as.POSIXct("2010-01-01"),
+ Date = as.Date("2016-01-01"),
+ complex = 1 + 2i,
+ int_with_class = structure(4L, class = "int_with_class"),
+ num_with_class = structure(4.5, class = "num_with_class")
+ )
+
+ special_non_vector_classes <- c(
+ "factor", "POSIXct", "Date", "table", "AsIs", "integer64"
+ )
+ pairs <- expand.grid(names(items), names(items))
+ pairs$can_combine <- FALSE
+ pairs$warning <- FALSE
+ pairs$item_pair <- vector("list", nrow(pairs))
+ pairs$result <- vector("list", nrow(pairs))
+
+ for (i in seq_len(nrow(pairs))) {
+ item1 <- items[[pairs$Var1[i]]]
+ item2 <- items[[pairs$Var2[i]]]
+ class1 <- class(item1)
+ class2 <- class(item2)
+ all_na1 <- all(is.na(item1))
+ all_na2 <- all(is.na(item2))
+ known_to_dplyr1 <-
+ is.vector(item1) ||
+ any(class1 %in% special_non_vector_classes)
+ known_to_dplyr2 <-
+ is.vector(item2) ||
+ any(class2 %in% special_non_vector_classes)
+
+ pairs$can_combine[i] <- can_be_combined(
+ item1, item2, class1, class2,
+ all_na1, all_na2, known_to_dplyr1, known_to_dplyr2
+ )
+
+ pairs$warning[i] <- give_a_warning(
+ item1, item2,
+ class1, class2,
+ known_to_dplyr1, known_to_dplyr2,
+ can_be_combined = pairs$can_combine[i]
+ )
+
+ pairs$item_pair[[i]] <- list(item1, item2)
+ pairs$result[i] <- combine_result(
+ item1, item2, class1, class2,
+ all_na1, all_na2,
+ known_to_dplyr1, known_to_dplyr2,
+ pairs$can_combine[i], pairs$warning[i]
+ )
+ }
+
+ return(pairs)
+}
+
+print_pairs <- function(pairs) {
+ pairs_printable <- pairs
+ pairs_printable$result <- sapply(
+ pairs$result,
+ function(x) {
+ if (is.null(x)) {
+ ""
+ } else {
+ as.character(x)
+ }
+ }
+ )
+ pairs_printable$result_class <- lapply(
+ pairs$result,
+ function(x) {
+ if (is.null(x)) {
+ ""
+ } else {
+ class(x)
+ }
+ }
+ )
+ pairs_printable <- arrange(
+ pairs_printable, desc(can_combine), warning, Var1, Var2
+ )
+ pairs_printable
+}
+
+combine_coercion_types <- function() {
+ pairs <- prepare_table_with_coercion_rules()
+ # knitr::kable(print_pairs(pairs))
+ for (i in seq_len(nrow(pairs))) {
+ test_that(paste0("Coercion from ", pairs$Var1[i], " to ", pairs$Var2[i]), {
+ combine_pair_test(
+ item_pair = pairs$item_pair[[i]],
+ var1 = pairs$Var1[i],
+ var2 = pairs$Var2[i],
+ result = pairs$result[[i]],
+ can_combine = pairs$can_combine[i],
+ warning = pairs$warning[i]
+ )
+ })
+ }
+}
diff --git a/tests/testthat/helper-encoding.R b/tests/testthat/helper-encoding.R
new file mode 100644
index 0000000..ac94a5a
--- /dev/null
+++ b/tests/testthat/helper-encoding.R
@@ -0,0 +1,53 @@
+get_lang_strings <- function() {
+ lang_strings <- c(
+ de = "Gl\u00fcck",
+ cn = "\u5e78\u798f",
+ ru = "\u0441\u0447\u0430\u0441\u0442\u044c\u0435",
+ ko = "\ud589\ubcf5"
+ )
+
+ native_lang_strings <- enc2native(lang_strings)
+
+ same <- (lang_strings == native_lang_strings)
+
+ list(
+ same = lang_strings[same],
+ different = lang_strings[!same]
+ )
+}
+
+get_native_lang_string <- function() {
+ lang_strings <- get_lang_strings()
+ if (length(lang_strings$same) == 0) testthat::skip("No native language string available")
+ lang_strings$same[[1L]]
+}
+
+get_alien_lang_string <- function() {
+ lang_strings <- get_lang_strings()
+ if (length(lang_strings$different) == 0) testthat::skip("No alien language string available")
+ lang_strings$different[[1L]]
+}
+
+with_non_utf8_encoding <- function(code) {
+ old_encoding <- set_non_utf8_encoding()
+ on.exit(set_encoding(old_encoding), add = TRUE)
+ code
+}
+
+set_non_utf8_encoding <- function() {
+ if (.Platform$OS.type == "windows") return(NULL)
+ tryCatch(
+ locale <- set_encoding("en_US.ISO88591"),
+ warning = function(e) {
+ testthat::skip("Cannot set latin-1 encoding")
+ }
+ )
+ locale
+}
+
+set_encoding <- function(encoding) {
+ if (is.null(encoding)) return(NULL)
+ locale <- Sys.getlocale("LC_CTYPE")
+ Sys.setlocale("LC_CTYPE", encoding)
+ locale
+}
diff --git a/tests/testthat/helper-groups.R b/tests/testthat/helper-groups.R
new file mode 100644
index 0000000..fb4c485
--- /dev/null
+++ b/tests/testthat/helper-groups.R
@@ -0,0 +1,13 @@
+expect_groups <- function(df, groups, info = NULL) {
+ if (length(groups) == 0L) {
+ expect_null(groups(df), info = info)
+ expect_identical(group_vars(df), character(), info = info)
+ } else {
+ expect_identical(groups(df), lapply(enc2native(groups), as.name), info = info)
+ expect_identical(group_vars(df), groups, info = info)
+ }
+}
+
+expect_no_groups <- function(df) {
+ expect_groups(df, NULL)
+}
diff --git a/tests/testthat/helper-hybrid.R b/tests/testthat/helper-hybrid.R
new file mode 100644
index 0000000..ef18544
--- /dev/null
+++ b/tests/testthat/helper-hybrid.R
@@ -0,0 +1,66 @@
+expect_predicate <- function(actual, expected) {
+ if (is.function(expected)) {
+ expect_true(expected(actual))
+ } else {
+ expect_identical(actual, expected)
+ }
+}
+
+check_hybrid_result <- function(expr, ..., expected, test_eval = TRUE) {
+ check_hybrid_result_(rlang::enquo(expr), ..., expected = expected, test_eval = test_eval)
+}
+
+check_hybrid_result_ <- function(expr, ..., expected, test_eval) {
+ expect_error(
+ expect_predicate(with_hybrid_(expr, ...), expected), NA)
+ if (test_eval) {
+ expect_predicate(eval_dots_(expr, ...), expected)
+ }
+}
+
+check_not_hybrid_result <- function(expr, ..., expected, test_eval = TRUE) {
+ check_not_hybrid_result_(rlang::enquo(expr), ..., expected = expected, test_eval = test_eval)
+}
+
+check_not_hybrid_result_ <- function(expr, ..., expected, test_eval) {
+ expect_error(
+ expect_predicate(without_hybrid_(expr, ...), expected), NA)
+ if (test_eval) {
+ expect_predicate(eval_dots_(expr, ...), expected)
+ }
+}
+
+expect_hybrid_error <- function(expr, ..., error) {
+ expect_hybrid_error_(rlang::enquo(expr), ..., error = error)
+}
+
+expect_hybrid_error_ <- function(expr, ..., error) {
+ expect_error(
+ with_hybrid_(expr, ...),
+ error
+ )
+}
+
+expect_not_hybrid_error <- function(expr, ..., error) {
+ expect_not_hybrid_error_(rlang::enquo(expr), ..., error = error)
+}
+
+expect_not_hybrid_error_ <- function(expr, ..., error) {
+ expect_error(
+ without_hybrid_(expr, ...),
+ error
+ )
+}
+
+expect_environments_clean <- function(x, stop_env = parent.frame()) {
+ if (!is.environment(x)) x <- environment(x)
+ if (identical(x, stop_env)) return()
+
+ obj_names <- ls(x, all.names = TRUE)
+ objs <- mget(obj_names, x)
+ lapply(objs, expect_is, "environment")
+
+ expect_environments_clean(parent.env(x), stop_env = stop_env)
+}
+
+bad_hybrid_handler <- function(...) stop("Expected hybrid evaluation")
diff --git a/tests/testthat/helper-output.R b/tests/testthat/helper-output.R
deleted file mode 100644
index 7624050..0000000
--- a/tests/testthat/helper-output.R
+++ /dev/null
@@ -1,5 +0,0 @@
-output_file <- function(filename) file.path("output", filename)
-
-expect_output_file_rel <- function(x, filename) {
- expect_output_file(x, output_file(filename), update = TRUE)
-}
diff --git a/tests/testthat/helper-src.R b/tests/testthat/helper-src.R
deleted file mode 100644
index e0e1b51..0000000
--- a/tests/testthat/helper-src.R
+++ /dev/null
@@ -1,15 +0,0 @@
-test_register_src("df", src_df(env = new.env(parent = emptyenv())))
-test_register_src("sqlite", src_sqlite(":memory:", create = TRUE))
-
-if (identical(Sys.info()[["user"]], "hadley")) {
- test_register_src("postgres", src_postgres("test", host = "localhost"))
-} else if (identical(Sys.getenv("TRAVIS"), "true")) {
- test_register_src("postgres", src_postgres("test", user = "travis", password = ""))
-}
-
-
-
-skip_if_no_sqlite <- function() {
- if (!test_srcs$has("sqlite"))
- skip("No SQLite")
-}
diff --git a/tests/testthat/helper-torture.R b/tests/testthat/helper-torture.R
new file mode 100644
index 0000000..e45d582
--- /dev/null
+++ b/tests/testthat/helper-torture.R
@@ -0,0 +1 @@
+with_gctorture2 <- withr::with_(gctorture2)
diff --git a/tests/testthat/output/iris--70.txt b/tests/testthat/output/iris--70.txt
deleted file mode 100644
index 7f39cbd..0000000
--- a/tests/testthat/output/iris--70.txt
+++ /dev/null
@@ -1,16 +0,0 @@
-Source: query [?? x 5]
-Database: sqlite x.y.z [:memory:]
-
- Sepal.Length Sepal.Width Petal.Length Petal.Width Species
- <dbl> <dbl> <dbl> <dbl> <chr>
-1 5.1 3.5 1.4 0.2 setosa
-2 4.9 3.0 1.4 0.2 setosa
-3 4.7 3.2 1.3 0.2 setosa
-4 4.6 3.1 1.5 0.2 setosa
-5 5.0 3.6 1.4 0.2 setosa
-6 5.4 3.9 1.7 0.4 setosa
-7 4.6 3.4 1.4 0.3 setosa
-8 5.0 3.4 1.5 0.2 setosa
-9 4.4 2.9 1.4 0.2 setosa
-10 4.9 3.1 1.5 0.1 setosa
-# ... with more rows
diff --git a/tests/testthat/output/iris-3-5.txt b/tests/testthat/output/iris-3-5.txt
deleted file mode 100644
index 06bda99..0000000
--- a/tests/testthat/output/iris-3-5.txt
+++ /dev/null
@@ -1,20 +0,0 @@
-Source: query [?? x 5]
-Database: sqlite x.y.z [:memory:]
-
- Sepal.Length
- <dbl>
-1 5.1
-2 4.9
-3 4.7
-# ...
-# with
-# more
-# rows,
-# and
-# 4
-# more
-# variables:
-# Sepal.Width <dbl>,
-# Petal.Length <dbl>,
-# Petal.Width <dbl>,
-# Species <chr>
diff --git a/tests/testthat/output/iris-5-30.txt b/tests/testthat/output/iris-5-30.txt
deleted file mode 100644
index fdc2cff..0000000
--- a/tests/testthat/output/iris-5-30.txt
+++ /dev/null
@@ -1,15 +0,0 @@
-Source: query [?? x 5]
-Database: sqlite x.y.z [:memory:]
-
- Sepal.Length Sepal.Width
- <dbl> <dbl>
-1 5.1 3.5
-2 4.9 3.0
-3 4.7 3.2
-4 4.6 3.1
-5 5.0 3.6
-# ... with more rows, and 3
-# more variables:
-# Petal.Length <dbl>,
-# Petal.Width <dbl>,
-# Species <chr>
diff --git a/tests/testthat/output/iris-head-30-80.txt b/tests/testthat/output/iris-head-30-80.txt
deleted file mode 100644
index 9cbdb3b..0000000
--- a/tests/testthat/output/iris-head-30-80.txt
+++ /dev/null
@@ -1,11 +0,0 @@
-Source: query [?? x 5]
-Database: sqlite x.y.z [:memory:]
-
- Sepal.Length Sepal.Width Petal.Length Petal.Width Species
- <dbl> <dbl> <dbl> <dbl> <chr>
-1 5.1 3.5 1.4 0.2 setosa
-2 4.9 3.0 1.4 0.2 setosa
-3 4.7 3.2 1.3 0.2 setosa
-4 4.6 3.1 1.5 0.2 setosa
-5 5.0 3.6 1.4 0.2 setosa
-6 5.4 3.9 1.7 0.4 setosa
diff --git a/tests/testthat/output/mtcars-8-30.txt b/tests/testthat/output/mtcars-8-30.txt
deleted file mode 100644
index 02d1c02..0000000
--- a/tests/testthat/output/mtcars-8-30.txt
+++ /dev/null
@@ -1,19 +0,0 @@
-Source: query [?? x 11]
-Database: sqlite x.y.z [:memory:]
-
- mpg cyl disp hp
- <dbl> <dbl> <dbl> <dbl>
-1 21.0 6 160.0 110
-2 21.0 6 160.0 110
-3 22.8 4 108.0 93
-4 21.4 6 258.0 110
-5 18.7 8 360.0 175
-6 18.1 6 225.0 105
-7 14.3 8 360.0 245
-8 24.4 4 146.7 62
-# ... with more rows, and 7
-# more variables:
-# drat <dbl>, wt <dbl>,
-# qsec <dbl>, vs <dbl>,
-# am <dbl>, gear <dbl>,
-# carb <dbl>
diff --git a/tests/testthat/test-DBI.R b/tests/testthat/test-DBI.R
new file mode 100644
index 0000000..3583be7
--- /dev/null
+++ b/tests/testthat/test-DBI.R
@@ -0,0 +1,15 @@
+context("DBI")
+
+test_that("can work directly with DBI connection", {
+ skip_if_not_installed("RSQLite")
+ skip_if_not_installed("dbplyr")
+
+ con <- DBI::dbConnect(RSQLite::SQLite(), ":memory:")
+
+ df <- tibble(x = 1:10, y = letters[1:10])
+ df1 <- copy_to(con, df)
+ df2 <- tbl(con, "df")
+
+ expect_equal(collect(df1), df)
+ expect_equal(collect(df2), df)
+})
diff --git a/tests/testthat/test-arrange.r b/tests/testthat/test-arrange.r
index 17780c7..98d4a5d 100644
--- a/tests/testthat/test-arrange.r
+++ b/tests/testthat/test-arrange.r
@@ -4,7 +4,7 @@ df2 <- data.frame(
a = rep(c(NA, 1, 2, 3), each = 4),
b = rep(c(0L, NA, 1L, 2L), 4),
c = c(NA, NA, NA, NA, letters[10:21]),
- d = rep( c(T, NA, F, T), each = 4),
+ d = rep(c(T, NA, F, T), each = 4),
id = 1:16,
stringsAsFactors = FALSE
)
@@ -39,26 +39,27 @@ test_that("local arrange sorts missing values to end", {
})
test_that("two arranges equivalent to one", {
- df1 <- frame_data(
+ df <- tribble(
~x, ~y,
2, 1,
2, -1,
1, 1
)
- tbls <- test_load(df1)
- single <- df1 %>% arrange(x, y)
- compare_tbls(tbls, function(x) x %>% arrange(y) %>% arrange(x), ref = single)
+ df1 <- df %>% arrange(x, y)
+ df2 <- df %>% arrange(y) %>% arrange(x)
+
+ expect_equal(df1, df2)
})
test_that("arrange handles list columns (#282)", {
- df <- data.frame( a = 2:1 )
- df$b <- list( "foo", "bar" )
+ df <- data.frame(a = 2:1)
+ df$b <- list("foo", "bar")
res <- arrange(df, a)
- expect_equal(res$b, list( "bar", "foo" ) )
+ expect_equal(res$b, list("bar", "foo"))
})
-test_that("arrange handles the case where ... is missing (#338)",{
+test_that("arrange handles the case where ... is missing (#338)", {
expect_equivalent(arrange(mtcars), mtcars)
})
@@ -77,40 +78,40 @@ test_that("grouped arrange ignores group (#491 -> #1206)", {
test_that("arrange keeps the grouping structure (#605)", {
dat <- data_frame(g = c(2, 2, 1, 1), x = c(1, 3, 2, 4))
res <- dat %>% group_by(g) %>% arrange()
- expect_is(res, "grouped_df" )
+ expect_is(res, "grouped_df")
expect_equal(res$x, dat$x)
res <- dat %>% group_by(g) %>% arrange(x)
expect_is(res, "grouped_df")
expect_equal(res$x, 1:4)
- expect_equal(attr(res,"indices"), list( c(1,3), c(0, 2)) )
+ expect_equal(attr(res, "indices"), list(c(1, 3), c(0, 2)))
})
test_that("arrange handles complex vectors", {
- d <- data.frame(x=1:10,y=10:1+2i)
- res <- arrange(d,y)
- expect_equal( res$y, rev(d$y) )
- expect_equal( res$x, rev(d$x) )
+ d <- data.frame(x = 1:10, y = 10:1 + 2i)
+ res <- arrange(d, y)
+ expect_equal(res$y, rev(d$y))
+ expect_equal(res$x, rev(d$x))
res <- arrange(res, desc(y))
- expect_equal( res$y, d$y )
- expect_equal( res$x, d$x )
+ expect_equal(res$y, d$y)
+ expect_equal(res$x, d$x)
- d$y[ c(3,6) ] <- NA
- res <- arrange(d,y)
- expect_true( all(is.na(res$y[9:10])) )
+ d$y[c(3, 6)] <- NA
+ res <- arrange(d, y)
+ expect_true(all(is.na(res$y[9:10])))
- res <- arrange(d,desc(y))
- expect_true( all(is.na(res$y[9:10])) )
+ res <- arrange(d, desc(y))
+ expect_true(all(is.na(res$y[9:10])))
})
test_that("arrange respects attributes #1105", {
env <- environment()
- Period <- suppressWarnings( setClass("Period", contains = "numeric", where = env) )
+ Period <- suppressWarnings(setClass("Period", contains = "numeric", where = env))
on.exit(removeClass("Period", where = env))
- df <- data.frame( p = Period(c(1, 2, 3)), x = 1:3 )
+ df <- data.frame(p = Period(c(1, 2, 3)), x = 1:3)
res <- arrange(df, p)
expect_is(res$p, "Period")
})
@@ -118,45 +119,78 @@ test_that("arrange respects attributes #1105", {
test_that("arrange works with empty data frame (#1142)", {
df <- data.frame()
res <- df %>% arrange
- expect_equal( nrow(res), 0L )
- expect_equal( length(res), 0L )
+ expect_equal(nrow(res), 0L)
+ expect_equal(length(res), 0L)
})
test_that("arrange respects locale (#1280)", {
- df2 <- data_frame( words = c("casa", "\u00e1rbol", "zona", "\u00f3rgano") )
+ df2 <- data_frame(words = c("casa", "\u00e1rbol", "zona", "\u00f3rgano"))
- res <- df2 %>% arrange( words )
- expect_equal( res$words, sort(df2$words) )
+ res <- df2 %>% arrange(words)
+ expect_equal(res$words, sort(df2$words))
- res <- df2 %>% arrange( desc(words) )
- expect_equal( res$words, sort(df2$words, decreasing = TRUE) )
+ res <- df2 %>% arrange(desc(words))
+ expect_equal(res$words, sort(df2$words, decreasing = TRUE))
})
test_that("duplicated column name is explicit about which column (#996)", {
- df <- data.frame( x = 1:10, x = 1:10 )
- names(df) <- c("x", "x")
- expect_error( df %>% arrange, "found duplicated column name: x|unique name.*'x'" )
+ df <- data.frame(x = 1:10, x = 1:10)
+ names(df) <- c("x", "x")
+
+ # Error message created by tibble
+ expect_error(df %>% arrange)
+
+ df <- data.frame(x = 1:10, x = 1:10, y = 1:10, y = 1:10)
+ names(df) <- c("x", "x", "y", "y")
- df <- data.frame( x = 1:10, x = 1:10, y = 1:10, y = 1:10 )
- names(df) <- c("x", "x", "y", "y")
- expect_error( df %>% arrange, "found duplicated column name: x, y|unique name.*'x', 'y'" )
+ # Error message created by tibble
+ expect_error(df %>% arrange)
})
test_that("arrange fails gracefully on list columns (#1489)", {
df <- expand.grid(group = 1:2, y = 1, x = 1) %>%
group_by(group) %>%
do(fit = lm(data = ., y ~ x))
- expect_error( arrange(df, fit), "Unsupported vector type list" )
+ expect_error(
+ arrange(df, fit),
+ "Argument 1 is of unsupported type list",
+ fixed = TRUE
+ )
})
test_that("arrange fails gracefully on raw columns (#1803)", {
df <- data_frame(a = 1:3, b = as.raw(1:3))
- expect_error( arrange(df, a), "unsupported type" )
- expect_error( arrange(df, b), "unsupported type" )
+ expect_error(
+ arrange(df, a),
+ "Column `b` is of unsupported type raw",
+ fixed = TRUE
+ )
+ expect_error(
+ arrange(df, b),
+ "Column `b` is of unsupported type raw",
+ fixed = TRUE
+ )
})
test_that("arrange fails gracefully on matrix input (#1870)", {
df <- data_frame(a = 1:3, b = 4:6)
- expect_error( arrange(df, is.na(df)), "matrix" )
+ expect_error(
+ arrange(df, is.na(df)),
+ "Argument 1 is of unsupported type matrix",
+ fixed = TRUE
+ )
})
+
+
+# grouped_df --------------------------------------------------------------
+
+test_that("can choose to inclue grouping vars", {
+ df <- tibble(g = c(1, 2), x = c(2, 1)) %>% group_by(g)
+
+ df1 <- df %>% arrange(x, .by_group = TRUE)
+ df2 <- df %>% arrange(g, x)
+
+ expect_equal(df1, df2)
+})
+
diff --git a/tests/testthat/test-as-data-frame.R b/tests/testthat/test-as-data-frame.R
deleted file mode 100644
index dbcec64..0000000
--- a/tests/testthat/test-as-data-frame.R
+++ /dev/null
@@ -1,44 +0,0 @@
-context("as-data-frame")
-
-
-# as.data.frame and as_data_frame -----------------------------------------
-
-test_that("as.data.frame works for SQL sources", {
- lf1 <- memdb_frame(x = letters)
- out <- lf1 %>%
- as.data.frame()
-
- expect_equal(out, data.frame(x = letters, stringsAsFactors = FALSE))
-})
-
-test_that("as_data_frame works for SQL sources", {
- if (packageVersion("tibble") < "1.0-4")
- skip("need tibble 1.0-4 or later for this test")
-
- lf1 <- memdb_frame(x = letters)
- out <- lf1 %>%
- as_data_frame()
-
- expect_equal(out, data_frame(x = letters))
-})
-
-test_that("as.data.frame is unlimited", {
- x <- rep(1:2, formals(collect.tbl_sql)$n)
- lf1 <- memdb_frame(x = x)
- out <- lf1 %>%
- as.data.frame()
-
- expect_equal(out, data.frame(x = x))
-})
-
-test_that("as_data_frame is unlimited", {
- if (packageVersion("tibble") < "1.0-4")
- skip("need tibble 1.0-4 or later for this test")
-
- x <- rep(1:2, formals(collect.tbl_sql)$n)
- lf1 <- memdb_frame(x = x)
- out <- lf1 %>%
- as_data_frame()
-
- expect_equal(out, data_frame(x = x))
-})
diff --git a/tests/testthat/test-astyle.R b/tests/testthat/test-astyle.R
new file mode 100644
index 0000000..9ec8dd5
--- /dev/null
+++ b/tests/testthat/test-astyle.R
@@ -0,0 +1,9 @@
+context("astyle")
+
+test_that("source code formatting", {
+ skip_on_cran()
+ skip_on_os("windows")
+ skip_on_travis()
+
+ expect_warning(astyle("--dry-run"), NA)
+})
diff --git a/tests/testthat/test-between.R b/tests/testthat/test-between.R
new file mode 100644
index 0000000..52ca204
--- /dev/null
+++ b/tests/testthat/test-between.R
@@ -0,0 +1,21 @@
+context("between")
+
+test_that("returns NA if any argument is NA", {
+ expect_equal(between(1, 1, NA), NA)
+ expect_equal(between(1, NA, 1), NA)
+ expect_equal(between(NA, 1, 1), NA)
+})
+
+test_that("compatible with base R", {
+ x <- runif(1e3)
+ expect_equal(between(x, 0.25, 0.5), x >= 0.25 & x <= 0.5)
+})
+
+test_that("warns when called on S3 object", {
+ expect_warning(between(factor(1:5), 1, 3), "numeric vector with S3 class")
+})
+
+test_that("unless it's a date or date time", {
+ expect_warning(between(Sys.Date(), 1, 3), NA)
+ expect_warning(between(Sys.time(), 1, 3), NA)
+})
diff --git a/tests/testthat/test-binds.R b/tests/testthat/test-binds.R
index e1edaff..06eb694 100644
--- a/tests/testthat/test-binds.R
+++ b/tests/testthat/test-binds.R
@@ -1,5 +1,36 @@
context("binds")
+
+# error -------------------------------------------------------------------
+
+test_that("bind_rows() and bind_cols() err for non-data frames (#2373)", {
+ df1 <- data_frame(x = 1)
+ df2 <- structure(list(x = 1), class = "blah_frame")
+
+ expect_error(
+ bind_cols(df1, df2),
+ "Argument 2 must be a data frame or a named atomic vector, not a blah_frame",
+ fixed = TRUE
+ )
+ expect_error(
+ bind_rows(df1, df2),
+ "Argument 2 must be a data frame or a named atomic vector, not a blah_frame",
+ fixed = TRUE
+ )
+})
+
+test_that("bind_rows() err for invalid ID", {
+ df1 <- data_frame(x = 1:3)
+ df2 <- data_frame(x = 4:6)
+
+ expect_error(
+ bind_rows(df1, df2, .id = 5),
+ "`.id` must be a scalar string, not double of length 1",
+ fixed = TRUE
+ )
+})
+
+
# columns -----------------------------------------------------------------
test_that("cbind uses shallow copies", {
@@ -7,11 +38,11 @@ test_that("cbind uses shallow copies", {
int = 1:10,
num = rnorm(10),
cha = letters[1:10],
- stringsAsFactors = FALSE )
+ stringsAsFactors = FALSE)
df2 <- data.frame(
- log = sample(c(T,F), 10, replace = TRUE),
- dat = seq.Date( Sys.Date(), length.out = 10, by = "day" ),
- tim = seq( Sys.time(), length.out = 10, by = "1 hour" )
+ log = sample(c(T, F), 10, replace = TRUE),
+ dat = seq.Date(Sys.Date(), length.out = 10, by = "day"),
+ tim = seq(Sys.time(), length.out = 10, by = "1 hour")
)
df <- bind_cols(df1, df2)
@@ -22,13 +53,34 @@ test_that("cbind uses shallow copies", {
test_that("bind_cols handles lists (#1104)", {
exp <- data_frame(x = 1, y = "a", z = 2)
- l1 <- list(x = 1, y = 'a')
+ l1 <- list(x = 1, y = "a")
l2 <- list(z = 2)
expect_equal(bind_cols(l1, l2), exp)
expect_equal(bind_cols(list(l1, l2)), exp)
})
+test_that("bind_cols handles empty argument list (#1963)", {
+ expect_equal(bind_cols(), data.frame())
+})
+
+test_that("bind_cols handles all-NULL values (#2303)", {
+ expect_identical(bind_cols(list(a = NULL, b = NULL)), data.frame())
+ expect_identical(bind_cols(NULL), data.frame())
+})
+
+test_that("bind_cols repairs names", {
+ df <- tibble(a = 1, b = 2)
+ bound <- bind_cols(df, df)
+
+ repaired <- as_tibble(tibble::repair_names(
+ data.frame(a = 1, b = 2, a = 1, b = 2, check.names = FALSE)
+ ))
+
+ expect_equal(bound, repaired)
+})
+
+
# rows --------------------------------------------------------------------
df_var <- data_frame(
@@ -56,16 +108,26 @@ test_that("bind_rows reorders columns", {
)
})
-test_that("bind_rows ignores NULL",{
+test_that("bind_rows ignores NULL", {
df <- data_frame(a = 1)
expect_equal(bind_rows(df, NULL), df)
expect_equal(bind_rows(list(df, NULL)), df)
})
-test_that("bind_rows only accepts data frames #288",{
- ll <- list(1:5, 6:10)
- expect_error(bind_rows(ll), "cannot convert")
+test_that("bind_rows only accepts data frames or named vectors", {
+ ll <- list(1:5, rlang::get_env())
+ expect_error(
+ bind_rows(ll),
+ "Argument 1 must have names",
+ fixed = TRUE
+ )
+ ll <- list(tibble(a = 1:5), rlang::get_env())
+ expect_error(
+ bind_rows(ll),
+ "Argument 2 must be a data frame or a named atomic vector, not a environment",
+ fixed = TRUE
+ )
})
test_that("bind_rows handles list columns (#463)", {
@@ -81,7 +143,7 @@ test_that("can bind lists of data frames #1389", {
expect_equal(nrow(res), 4)
})
-test_that("bind_rows handles data frames with no rows (#597)",{
+test_that("bind_rows handles data frames with no rows (#597)", {
df1 <- data_frame(x = 1, y = factor("a"))
df0 <- df1[0, ]
@@ -98,7 +160,35 @@ test_that("bind_rows handles data frames with no columns (#1346)", {
expect_equal(dim(bind_rows(df0, df0)), c(2, 0))
res <- bind_rows(df0, df1)
- expect_equal(res$x, c(1, NA))
+ expect_equal(res$x, c(NA, 1))
+})
+
+test_that("bind_rows handles lists with NULL values (#2056)", {
+ df1 <- data_frame(x = 1, y = 1)
+ df2 <- data_frame(x = 2, y = 2)
+ lst1 <- list(a = df1, NULL, b = df2)
+
+ df3 <- data_frame(
+ names = c("a", "b"),
+ x = c(1, 2),
+ y = c(1, 2)
+ )
+
+ expect_equal(bind_rows(lst1, .id = "names"), df3)
+})
+
+test_that("bind_rows handles lists with list() values (#2826)", {
+ expect_equal(bind_rows(list(iris, list())), iris)
+})
+
+test_that("bind_rows puts data frames in order received even if no columns (#2175)", {
+ df2 <- data_frame(x = 2, y = "b")
+ df1 <- df2[, 0]
+
+ res <- bind_rows(df1, df2)
+
+ expect_equal(res$x, c(NA, 2))
+ expect_equal(res$y, c(NA, "b"))
})
# Column coercion --------------------------------------------------------------
@@ -112,20 +202,25 @@ test_that("bind_rows promotes integer to numeric", {
expect_equal(typeof(res$b), "integer")
})
-test_that("bind_rows promotes logical to integer", {
+test_that("bind_rows does not coerce logical to integer", {
df1 <- data_frame(a = FALSE)
df2 <- data_frame(a = 1L)
- res <- bind_rows(df1, df2)
- expect_equal(res$a, c(0L, 1L))
+ expect_error(
+ bind_rows(df1, df2),
+ "Column `a` can't be converted from logical to integer",
+ fixed = TRUE
+ )
})
test_that("bind_rows promotes factor to character with warning", {
- df1 <- data_frame(a = factor("a"))
- df2 <- data_frame(a = "b")
+ df1 <- data_frame(a = factor("a"))
+ df2 <- data_frame(a = "b")
- expect_warning(res <- bind_rows(df1, df2),
- "binding factor and character vector, coercing into character vector" )
+ expect_warning(
+ res <- bind_rows(df1, df2),
+ "binding factor and character vector, coercing into character vector"
+ )
expect_equal(typeof(res$a), "character")
})
@@ -153,69 +248,79 @@ test_that("bind_rows doesn't promote integer/numeric to factor", {
df2 <- data_frame(a = 1L)
df3 <- data_frame(a = 1)
- expect_error(bind_rows(df1, df2), "from factor to integer")
- expect_error(bind_rows(df1, df3), "from factor to numeric")
+ expect_error(
+ bind_rows(df1, df2),
+ "Column `a` can't be converted from factor to integer",
+ fixed = TRUE
+ )
+ expect_error(
+ bind_rows(df1, df3),
+ "Column `a` can't be converted from factor to numeric",
+ fixed = TRUE
+ )
})
test_that("bind_rows preserves timezones #298", {
- dates1 <- data.frame(ID=c("a", "b", "c"),
+ dates1 <- data.frame(ID = c("a", "b", "c"),
dates = structure(c(-247320000, -246196800, -245073600),
tzone = "GMT",
class = c("POSIXct", "POSIXt")),
- stringsAsFactors=FALSE)
+ stringsAsFactors = FALSE)
- dates2 <- data.frame(ID=c("d", "e", "f"),
- dates=structure(c(-243864000, -242654400, -241444800),
+ dates2 <- data.frame(ID = c("d", "e", "f"),
+ dates = structure(c(-243864000, -242654400, -241444800),
tzone = "GMT",
class = c("POSIXct", "POSIXt")),
- stringsAsFactors=FALSE)
+ stringsAsFactors = FALSE)
alldates <- bind_rows(dates1, dates2)
- expect_equal( attr( alldates$dates, "tzone" ), "GMT" )
+ expect_equal(attr(alldates$dates, "tzone"), "GMT")
})
test_that("bind_rows handles all NA columns (#493)", {
mydata <- list(
- data.frame(x=c("foo", "bar")),
- data.frame(x=NA)
+ data.frame(x = c("foo", "bar")),
+ data.frame(x = NA)
)
res <- bind_rows(mydata)
- expect_true( is.na(res$x[3]) )
- expect_is( res$x, "factor" )
+ expect_true(is.na(res$x[3]))
+ expect_is(res$x, "factor")
mydata <- list(
- data.frame(x=NA),
- data.frame(x=c("foo", "bar"))
+ data.frame(x = NA),
+ data.frame(x = c("foo", "bar"))
)
res <- bind_rows(mydata)
- expect_true( is.na(res$x[1]) )
- expect_is( res$x, "factor" )
+ expect_true(is.na(res$x[1]))
+ expect_is(res$x, "factor")
})
-test_that( "bind_rows handles complex. #933", {
- df1 <- data.frame(r = c(1+1i, 2-1i))
- df2 <- data.frame(r = c(1-1i, 2+1i))
- df3 <- bind_rows(df1,df2)
- expect_equal( nrow(df3), 4L)
- expect_equal( df3$r, c(df1$r, df2$r) )
+test_that("bind_rows handles complex. #933", {
+ df1 <- data.frame(r = c(1 + 1i, 2 - 1i))
+ df2 <- data.frame(r = c(1 - 1i, 2 + 1i))
+ df3 <- bind_rows(df1, df2)
+ expect_equal(nrow(df3), 4L)
+ expect_equal(df3$r, c(df1$r, df2$r))
})
test_that("bind_rows is careful about column names encoding #1265", {
- one <- data.frame(foo=1:3, bar=1:3); names(one) <- c("f\u00fc", "bar")
- two <- data.frame(foo=1:3, bar=1:3); names(two) <- c("f\u00fc", "bar")
+ one <- data.frame(foo = 1:3, bar = 1:3)
+ names(one) <- c("f\u00fc", "bar")
+ two <- data.frame(foo = 1:3, bar = 1:3)
+ names(two) <- c("f\u00fc", "bar")
Encoding(names(one)[1]) <- "UTF-8"
- expect_equal( names(one), names(two))
- res <- bind_rows(one,two)
+ expect_equal(names(one), names(two))
+ res <- bind_rows(one, two)
expect_equal(ncol(res), 2L)
})
test_that("bind_rows handles POSIXct (#1125)", {
df1 <- data.frame(date = as.POSIXct(NA))
df2 <- data.frame(date = as.POSIXct("2015-05-05"))
- res <- bind_rows(df1,df2)
- expect_equal(nrow(res),2L)
+ res <- bind_rows(df1, df2)
+ expect_equal(nrow(res), 2L)
expect_true(is.na(res$date[1]))
})
@@ -224,22 +329,22 @@ test_that("bind_rows respects ordered factors (#1112)", {
id <- factor(c("a", "c", "d"), levels = l, ordered = TRUE)
df <- data.frame(id = rep(id, 2), val = rnorm(6))
res <- bind_rows(df, df)
- expect_is( res$id, "ordered")
- expect_equal( levels(df$id), levels(res$id) )
+ expect_is(res$id, "ordered")
+ expect_equal(levels(df$id), levels(res$id))
res <- group_by(df, id) %>% do(na.omit(.))
- expect_is( res$id, "ordered")
- expect_equal( levels(df$id), levels(res$id) )
+ expect_is(res$id, "ordered")
+ expect_equal(levels(df$id), levels(res$id))
})
test_that("bind_rows can handle lists (#1104)", {
- my_list <- list(list(x = 1, y = 'a'), list(x = 2, y = 'b'))
+ my_list <- list(tibble(x = 1, y = "a"), tibble(x = 2, y = "b"))
res <- bind_rows(my_list)
expect_equal(nrow(res), 2L)
expect_is(res$x, "numeric")
expect_is(res$y, "character")
- res <- bind_rows(list(x = 1, y = 'a'), list(x = 2, y = 'b'))
+ res <- bind_rows(list(x = 1, y = "a"), list(x = 2, y = "b"))
expect_equal(nrow(res), 2L)
expect_is(res$x, "numeric")
expect_is(res$y, "character")
@@ -247,11 +352,11 @@ test_that("bind_rows can handle lists (#1104)", {
test_that("bind_rows keeps ordered factors (#948)", {
y <- bind_rows(
- data.frame(x=factor(c(1,2,3),ordered=TRUE)),
- data.frame(x=factor(c(1,2,3),ordered=TRUE))
+ data.frame(x = factor(c(1, 2, 3), ordered = TRUE)),
+ data.frame(x = factor(c(1, 2, 3), ordered = TRUE))
)
- expect_is( y$x, "ordered" )
- expect_equal( levels(y$x), as.character(1:3) )
+ expect_is(y$x, "ordered")
+ expect_equal(levels(y$x), as.character(1:3))
})
test_that("bind handles POSIXct of different tz ", {
@@ -259,24 +364,24 @@ test_that("bind handles POSIXct of different tz ", {
date2 <- structure(-1735660800, tzone = "UTC", class = c("POSIXct", "POSIXt"))
date3 <- structure(-1735660800, class = c("POSIXct", "POSIXt"))
- df1 <- data.frame( date = date1 )
- df2 <- data.frame( date = date2 )
- df3 <- data.frame( date = date3 )
+ df1 <- data.frame(date = date1)
+ df2 <- data.frame(date = date2)
+ df3 <- data.frame(date = date3)
res <- bind_rows(df1, df2)
- expect_equal( attr(res$date, "tzone"), "UTC" )
+ expect_equal(attr(res$date, "tzone"), "UTC")
res <- bind_rows(df1, df3)
- expect_equal( attr(res$date, "tzone"), "America/Chicago" )
+ expect_equal(attr(res$date, "tzone"), "America/Chicago")
res <- bind_rows(df2, df3)
- expect_equal( attr(res$date, "tzone"), "UTC" )
+ expect_equal(attr(res$date, "tzone"), "UTC")
res <- bind_rows(df3, df3)
- expect_equal( attr(res$date, "tzone"), NULL )
+ expect_equal(attr(res$date, "tzone"), NULL)
res <- bind_rows(df1, df2, df3)
- expect_equal( attr(res$date, "tzone"), "UTC" )
+ expect_equal(attr(res$date, "tzone"), "UTC")
})
@@ -303,30 +408,30 @@ test_that("string vectors are filled with NA not blanks before collection (#595)
two$char_col <- letters[1:22]
res <- bind_rows(one, two)
- expect_true( all(is.na(res$char_col[1:10])) )
+ expect_true(all(is.na(res$char_col[1:10])))
})
test_that("bind_rows handles POSIXct stored as integer (#1402)", {
now <- Sys.time()
df1 <- data.frame(time = now)
- expect_equal( class(bind_rows(df1)$time), c("POSIXct", "POSIXt") )
+ expect_equal(class(bind_rows(df1)$time), c("POSIXct", "POSIXt"))
df2 <- data.frame(time = seq(now, length.out = 1, by = 1))
- expect_equal( class(bind_rows(df2)$time), c("POSIXct", "POSIXt") )
+ expect_equal(class(bind_rows(df2)$time), c("POSIXct", "POSIXt"))
- res <- bind_rows( df1, df2 )
- expect_equal( class(res$time), c("POSIXct", "POSIXt") )
- expect_true( all(res$time == c(df1$time, df2$time) ) )
+ res <- bind_rows(df1, df2)
+ expect_equal(class(res$time), c("POSIXct", "POSIXt"))
+ expect_true(all(res$time == c(df1$time, df2$time)))
})
test_that("bind_cols accepts NULL (#1148)", {
- df1 <- data_frame(a=1:10, b = 1:10)
- df2 <- data_frame(c=1:10, d = 1:10)
+ df1 <- data_frame(a = 1:10, b = 1:10)
+ df2 <- data_frame(c = 1:10, d = 1:10)
- res1 <- bind_cols(df1,df2)
- res2 <- bind_cols(NULL,df1,df2)
- res3 <- bind_cols(df1, NULL,df2)
+ res1 <- bind_cols(df1, df2)
+ res2 <- bind_cols(NULL, df1, df2)
+ res3 <- bind_cols(df1, NULL, df2)
res4 <- bind_cols(df1, df2, NULL)
expect_equal(res1, res2)
@@ -335,22 +440,38 @@ test_that("bind_cols accepts NULL (#1148)", {
})
test_that("bind_rows handles 0-length named list (#1515)", {
- res <- bind_rows(list(a=1)[-1])
- expect_equal( nrow(res), 0L)
- expect_is(res, "data.frame")
- expect_equal( ncol(res), 0L)
+ res <- bind_rows(list(a = 1)[-1])
+ expect_equal(nrow(res), 0L)
+ expect_is(res, "data.frame")
+ expect_equal(ncol(res), 0L)
})
test_that("bind_rows handles promotion to strings (#1538)", {
- df1 <- data_frame(b=c(1,2))
- df2 <- data_frame(b=c(1L,2L))
- df3 <- data_frame(b=factor(c("A","B")))
- df4 <- data_frame(b=c("C","D"))
-
- expect_error( bind_rows(df1,df3) )
- expect_error( bind_rows(df1,df4) )
- expect_error( bind_rows(df2,df3) )
- expect_error( bind_rows(df2,df4) )
+ df1 <- data_frame(b = c(1, 2))
+ df2 <- data_frame(b = c(1L, 2L))
+ df3 <- data_frame(b = factor(c("A", "B")))
+ df4 <- data_frame(b = c("C", "D"))
+
+ expect_error(
+ bind_rows(df1, df3),
+ "Column `b` can't be converted from numeric to factor",
+ fixed = TRUE
+ )
+ expect_error(
+ bind_rows(df1, df4),
+ "Column `b` can't be converted from numeric to character",
+ fixed = TRUE
+ )
+ expect_error(
+ bind_rows(df2, df3),
+ "Column `b` can't be converted from integer to factor",
+ fixed = TRUE
+ )
+ expect_error(
+ bind_rows(df2, df4),
+ "Column `b` can't be converted from integer to character",
+ fixed = TRUE
+ )
})
test_that("bind_rows infers classes from first result (#1692)", {
@@ -360,14 +481,14 @@ test_that("bind_rows infers classes from first result (#1692)", {
d4 <- rowwise(d1)
d5 <- list(a = 1:10, b = rep(1:2, each = 5))
- expect_equal( class(bind_rows(d1,d1)), "data.frame" )
- expect_equal( class(bind_rows(d2,d1)), c("tbl_df", "tbl", "data.frame") )
- res3 <- bind_rows(d3,d1)
- expect_equal( class(res3), c("grouped_df", "tbl_df", "tbl", "data.frame") )
- expect_equal( attr(res3, "group_sizes"), c(10,10) )
- expect_equal( class(bind_rows(d4,d1)), c("rowwise_df", "tbl_df", "tbl", "data.frame") )
- expect_equal( class(bind_rows(d5,d1)), c("tbl_df", "tbl", "data.frame") )
+ expect_equal(class(bind_rows(d1, d1)), "data.frame")
+ expect_equal(class(bind_rows(d2, d1)), c("tbl_df", "tbl", "data.frame"))
+ res3 <- bind_rows(d3, d1)
+ expect_equal(class(res3), c("grouped_df", "tbl_df", "tbl", "data.frame"))
+ expect_equal(attr(res3, "group_sizes"), c(10, 10))
+ expect_equal(class(bind_rows(d4, d1)), c("rowwise_df", "tbl_df", "tbl", "data.frame"))
+ expect_equal(class(bind_rows(d5, d1)), c("tbl_df", "tbl", "data.frame"))
})
test_that("bind_cols infers classes from first result (#1692)", {
@@ -377,18 +498,114 @@ test_that("bind_cols infers classes from first result (#1692)", {
d4 <- rowwise(d2)
d5 <- list(c = 1:10, d = rep(1:2, each = 5))
- expect_equal( class(bind_cols(d1,d1)), "data.frame" )
- expect_equal( class(bind_cols(d2,d1)), c("tbl_df", "tbl", "data.frame") )
- res3 <- bind_cols(d3,d1)
- expect_equal( class(res3), c("grouped_df", "tbl_df", "tbl", "data.frame") )
- expect_equal( attr(res3, "group_sizes"), c(5,5) )
- expect_equal( class(bind_rows(d4,d1)), c("rowwise_df", "tbl_df", "tbl", "data.frame") )
- expect_equal( class(bind_rows(d5,d1)), c("tbl_df", "tbl", "data.frame") )
-
+ expect_equal(class(bind_cols(d1, d1)), "data.frame")
+ expect_equal(class(bind_cols(d2, d1)), c("tbl_df", "tbl", "data.frame"))
+ res3 <- bind_cols(d3, d1)
+ expect_equal(class(res3), c("grouped_df", "tbl_df", "tbl", "data.frame"))
+ expect_equal(attr(res3, "group_sizes"), c(5, 5))
+ expect_equal(class(bind_cols(d4, d1)), c("rowwise_df", "tbl_df", "tbl", "data.frame"))
+ expect_equal(class(bind_cols(d5, d1)), c("tbl_df", "tbl", "data.frame"))
})
test_that("bind_rows rejects POSIXlt columns (#1789)", {
df <- data_frame(x = Sys.time() + 1:12)
df$y <- as.POSIXlt(df$x)
- expect_error(bind_rows(df, df), "not supported")
+ expect_error(
+ bind_rows(df, df),
+ "Argument 2 can't be a list containing POSIXlt values",
+ fixed = TRUE
+ )
+})
+
+test_that("bind_rows rejects data frame columns (#2015)", {
+ df <- list(
+ x = 1:10,
+ y = data.frame(a = 1:10, y = 1:10)
+ )
+ class(df) <- "data.frame"
+ attr(df, "row.names") <- .set_row_names(10)
+
+ expect_error(
+ dplyr::bind_rows(df, df),
+ "Argument 2 can't be a list containing data frames",
+ fixed = TRUE
+ )
+})
+
+test_that("bind_rows accepts difftime objects", {
+ df1 <- data.frame(x = as.difftime(1, units = "hours"))
+ df2 <- data.frame(x = as.difftime(1, units = "mins"))
+ res <- bind_rows(df1, df2)
+ expect_equal(res$x, as.difftime(c(3600, 60), units = "secs"))
+})
+
+test_that("bind_rows accepts hms objects", {
+ df1 <- data.frame(x = hms::hms(hours = 1))
+ df2 <- data.frame(x = as.difftime(1, units = "mins"))
+ res <- bind_rows(df1, df2)
+ expect_equal(res$x, hms::hms(hours = c(1, 0), minutes = c(0, 1)))
+})
+
+test_that("bind_rows() fails with unnamed vectors", {
+ expect_error(
+ bind_rows(1:2),
+ "Argument 1 must have names",
+ fixed = TRUE
+ )
+})
+
+test_that("bind_rows() handles rowwise vectors", {
+ expect_warning(regex = "character and factor",
+ tbl <- bind_rows(
+ tibble(a = "foo", b = "bar"),
+ c(a = "A", b = "B"),
+ set_names(factor(c("B", "B")), c("a", "b"))
+ ))
+ expect_identical(tbl, tibble(a = c("foo", "A", "B"), b = c("bar", "B", "B")))
+
+ id_tbl <- bind_rows(a = c(a = 1, b = 2), b = c(a = 3, b = 4), .id = "id")
+ expect_identical(id_tbl, tibble(id = c("a", "b"), a = c(1, 3), b = c(2, 4)))
+})
+
+test_that("bind_rows() accepts lists of dataframe-like lists as first argument", {
+ expect_identical(bind_rows(list(list(a = 1, b = 2))), tibble(a = 1, b = 2))
+})
+
+
+# Vectors ------------------------------------------------------------
+
+test_that("accepts named columns", {
+ expect_identical(bind_cols(a = 1:2, b = 3:4), tibble(a = 1:2, b = 3:4))
+ expect_equal(bind_cols(!!! mtcars), as_tibble(mtcars))
})
+
+test_that("uncompatible sizes fail", {
+ expect_error(
+ bind_cols(a = 1, mtcars),
+ "Argument 2 must be length 32, not 1",
+ fixed = TRUE
+ )
+ expect_error(
+ bind_cols(mtcars, a = 1),
+ "Argument 2 must be length 1, not 32",
+ fixed = TRUE
+ )
+})
+
+test_that("unnamed vectors fail", {
+ expect_error(
+ bind_cols(1:2),
+ "Argument 1 must have names",
+ fixed = TRUE
+ )
+ expect_error(
+ bind_cols(!!! list(1:2)),
+ "Argument 1 must have names",
+ fixed = TRUE
+ )
+})
+
+test_that("supports NULL values", {
+ expect_identical(bind_cols(a = 1, NULL, b = 2, NULL), tibble(a = 1, b = 2))
+})
+
diff --git a/tests/testthat/test-case-when.R b/tests/testthat/test-case-when.R
index e689596..994e614 100644
--- a/tests/testthat/test-case-when.R
+++ b/tests/testthat/test-case-when.R
@@ -1,7 +1,11 @@
context("case_when")
test_that("zero inputs throws an error", {
- expect_error(case_when(), "No cases provided")
+ expect_error(
+ case_when(),
+ "No cases provided",
+ fixed = TRUE
+ )
})
test_that("error messages", {
@@ -9,7 +13,7 @@ test_that("error messages", {
case_when(
paste(50)
),
- "Case 1 (paste(50)) is not a two-sided formula",
+ "Case 1 (`paste(50)`) must be a two-sided formula, not a string",
fixed = TRUE
)
@@ -17,7 +21,7 @@ test_that("error messages", {
case_when(
50 ~ 1:3
),
- "LHS of case 1 (50) is double, not logical",
+ "LHS of case 1 (`50`) must be a logical, not double",
fixed = TRUE
)
})
@@ -26,9 +30,10 @@ test_that("cases must yield compatible lengths", {
expect_error(
case_when(
c(TRUE, FALSE) ~ 1,
- c(FALSE, TRUE, FALSE) ~ 2
+ c(FALSE, TRUE, FALSE) ~ 2,
+ c(FALSE, TRUE, FALSE, NA) ~ 3
),
- "LHS of case 1 (c(TRUE, FALSE)) is length 2",
+ "`c(FALSE, TRUE, FALSE) ~ 2`, `c(FALSE, TRUE, FALSE, NA) ~ 3` must be length 2 or one, not 3, 4",
fixed = TRUE
)
@@ -37,7 +42,7 @@ test_that("cases must yield compatible lengths", {
c(TRUE, FALSE) ~ 1:3,
c(FALSE, TRUE) ~ 1:2
),
- "RHS of case 1 (1:3) is length 3",
+ "`c(TRUE, FALSE) ~ 1:3` must be length 2 or one, not 3",
fixed = TRUE
)
})
@@ -64,3 +69,59 @@ test_that("unmatched gets missing value", {
c(1, 2, NA)
)
})
+
+test_that("missing values can be replaced (#1999)", {
+ x <- c(1:3, NA)
+ expect_equal(
+ case_when(
+ x <= 1 ~ 1,
+ x <= 2 ~ 2,
+ is.na(x) ~ 0
+ ),
+ c(1, 2, NA, 0)
+ )
+})
+
+test_that("NA conditions (#2927)", {
+ expect_equal(
+ case_when(
+ c(TRUE, FALSE, NA) ~ 1:3,
+ TRUE ~ 4L
+ ),
+ c(1L, 4L, 4L)
+ )
+})
+
+test_that("atomic conditions (#2909)", {
+ expect_equal(
+ case_when(
+ TRUE ~ 1:3,
+ FALSE ~ 4:6
+ ),
+ 1:3
+ )
+ expect_equal(
+ case_when(
+ NA ~ 1:3,
+ TRUE ~ 4:6
+ ),
+ 4:6
+ )
+})
+
+test_that("zero-length conditions and values (#3041)", {
+ expect_equal(
+ case_when(
+ TRUE ~ integer(),
+ FALSE ~ integer()
+ ),
+ integer()
+ )
+ expect_equal(
+ case_when(
+ logical() ~ 1,
+ logical() ~ 2
+ ),
+ numeric()
+ )
+})
diff --git a/tests/testthat/test-coalesce.R b/tests/testthat/test-coalesce.R
index 5f54f6e..0458554 100644
--- a/tests/testthat/test-coalesce.R
+++ b/tests/testthat/test-coalesce.R
@@ -14,5 +14,9 @@ test_that("finds non-missing values in multiple positions", {
})
test_that("error if invalid length", {
- expect_error(coalesce(1:2, 1:3), "Vector 1 is length 3 not 1 or 2")
+ expect_error(
+ coalesce(1:2, 1:3),
+ "Argument 2 must be length 2 (length of `x`) or one, not 3",
+ fixed = TRUE
+ )
})
diff --git a/tests/testthat/test-colwise-arrange.R b/tests/testthat/test-colwise-arrange.R
new file mode 100644
index 0000000..bf9d099
--- /dev/null
+++ b/tests/testthat/test-colwise-arrange.R
@@ -0,0 +1,13 @@
+context("colwise arrange")
+
+df <- mtcars[1:3]
+
+test_that("scoped arrange is identical to manual arrange", {
+ expect_identical(arrange_all(df), arrange(df, mpg, cyl, disp))
+ expect_identical(arrange_at(df, vars(mpg)), arrange(df, mpg))
+ expect_identical(arrange_if(iris, is.factor), arrange(iris, Species))
+})
+
+test_that(".funs is applied to variables before sorting", {
+ expect_identical(arrange_all(df, `-`), arrange(df, -mpg, -cyl, -disp))
+})
diff --git a/tests/testthat/test-colwise-filter.R b/tests/testthat/test-colwise-filter.R
new file mode 100644
index 0000000..b471332
--- /dev/null
+++ b/tests/testthat/test-colwise-filter.R
@@ -0,0 +1,31 @@
+context("colwise filter")
+
+test_that("filter_if()", {
+ expect_identical(nrow(filter_if(mtcars, is_integerish, all_vars(. > 1))), 0L)
+ expect_identical(nrow(filter_if(mtcars, is_integerish, all_vars(. > 0))), 7L)
+})
+
+test_that("filter_at()", {
+ sepal_large <- filter_at(iris, vars(starts_with("Sepal")), all_vars(. > 4))
+ expect_equal(sepal_large$Sepal.Length, c(5.7, 5.2, 5.5))
+})
+
+test_that("filter_all()", {
+ expect_identical(filter_all(mtcars, any_vars(. > 200))$disp, mtcars$disp[mtcars$disp > 200])
+})
+
+test_that("aborts on empty selection", {
+ expect_error(
+ filter_if(mtcars, is_character, all_vars(. > 0)),
+ "`.predicate` has no matching columns",
+ fixed = TRUE
+ )
+})
+
+test_that("aborts when supplied funs()", {
+ expect_error(
+ filter_all(mtcars, funs(. > 0)),
+ "`.vars_predicate` must be a call to `all_vars()` or `any_vars()`, not list",
+ fixed = TRUE
+ )
+})
diff --git a/tests/testthat/test-colwise-group-by.R b/tests/testthat/test-colwise-group-by.R
new file mode 100644
index 0000000..d60d611
--- /dev/null
+++ b/tests/testthat/test-colwise-group-by.R
@@ -0,0 +1,16 @@
+context("colwise group_by")
+
+test_that("group_by_ verbs take scoped inputs", {
+ expect_identical(group_vars(group_by_all(mtcars)), names(mtcars))
+ expect_identical(group_vars(group_by_at(mtcars, vars(starts_with("d")))), c("disp", "drat"))
+ expect_identical(group_vars(group_by_if(iris, is.factor)), "Species")
+})
+
+test_that("group_by_ verbs accept optional operations", {
+ df <- data_frame(x = 1:2, y = 2:3)
+ gdf <- group_by(mutate_all(df, as.factor), x, y)
+
+ expect_identical(group_by_all(df, as.factor), gdf)
+ expect_identical(group_by_if(df, is_integer, as.factor), gdf)
+ expect_identical(group_by_at(df, vars(x:y), as.factor), gdf)
+})
diff --git a/tests/testthat/test-colwise-mutate.R b/tests/testthat/test-colwise-mutate.R
new file mode 100644
index 0000000..d0af580
--- /dev/null
+++ b/tests/testthat/test-colwise-mutate.R
@@ -0,0 +1,133 @@
+context("colwise mutate/summarise")
+
+test_that("funs found in current environment", {
+ f <- function(x) 1
+ df <- data.frame(x = c(2:10, 1000))
+
+ out <- summarise_all(df, funs(f, mean, median))
+ expect_equal(out, data.frame(f = 1, mean = 105.4, median = 6.5))
+})
+
+test_that("can use character vectors", {
+ df <- data.frame(x = 1:3)
+
+ expect_equal(summarise_all(df, "mean"), summarise_all(df, funs(mean)))
+ expect_equal(mutate_all(df, list(mean = "mean")), mutate_all(df, funs(mean = mean)))
+})
+
+test_that("can use bare functions", {
+ df <- data.frame(x = 1:3)
+
+ expect_equal(summarise_all(df, mean), summarise_all(df, funs(mean)))
+ expect_equal(mutate_all(df, mean), mutate_all(df, funs(mean)))
+})
+
+test_that("default names are smallest unique set", {
+ df <- data.frame(x = 1:3, y = 1:3)
+
+ expect_named(summarise_at(df, vars(x:y), funs(mean)), c("x", "y"))
+ expect_named(summarise_at(df, vars(x), funs(mean, sd)), c("mean", "sd"))
+ expect_named(summarise_at(df, vars(x:y), funs(mean, sd)), c("x_mean", "y_mean", "x_sd", "y_sd"))
+ expect_named(summarise_at(df, vars(x:y), funs(base::mean, stats::sd)), c("x_base::mean", "y_base::mean", "x_stats::sd", "y_stats::sd"))
+})
+
+test_that("named arguments force complete named", {
+ df <- data.frame(x = 1:3, y = 1:3)
+ expect_named(summarise_at(df, vars(x:y), funs(mean = mean)), c("x_mean", "y_mean"))
+ expect_named(summarise_at(df, vars(x = x), funs(mean, sd)), c("x_mean", "x_sd"))
+})
+
+expect_classes <- function(tbl, expected) {
+ classes <- unname(map_chr(tbl, class))
+ classes <- paste0(substring(classes, 0, 1), collapse = "")
+ expect_equal(classes, expected)
+}
+
+test_that("can select colwise", {
+ columns <- iris %>% mutate_at(vars(starts_with("Petal")), as.character)
+ expect_classes(columns, "nnccf")
+
+ numeric <- iris %>% mutate_at(c(1, 3), as.character)
+ expect_classes(numeric, "cncnf")
+
+ character <- iris %>% mutate_at("Species", as.character)
+ expect_classes(character, "nnnnc")
+})
+
+test_that("can probe colwise", {
+ predicate <- iris %>% mutate_if(is.factor, as.character)
+ expect_classes(predicate, "nnnnc")
+
+ logical <- iris %>% mutate_if(c(TRUE, FALSE, TRUE, TRUE, FALSE), as.character)
+ expect_classes(logical, "cnccf")
+})
+
+test_that("non syntactic colnames work", {
+ df <- data_frame(`x 1` = 1:3)
+ expect_identical(summarise_at(df, "x 1", sum)[[1]], 6L)
+ expect_identical(summarise_if(df, is.numeric, sum)[[1]], 6L)
+ expect_identical(summarise_all(df, sum)[[1]], 6L)
+ expect_identical(mutate_all(df, `*`, 2)[[1]], (1:3) * 2)
+})
+
+test_that("empty selection does not select everything (#2009, #1989)", {
+ expect_equal(mtcars, mutate_if(mtcars, is.factor, as.character))
+})
+
+test_that("error is thrown with improper additional arguments", {
+ # error messages by base R, not checked
+ expect_error(mutate_all(mtcars, round, 0, 0))
+ expect_error(mutate_all(mtcars, mean, na.rm = TRUE, na.rm = TRUE))
+})
+
+test_that("predicate can be quoted", {
+ expected <- mutate_if(mtcars, is_integerish, mean)
+ expect_identical(mutate_if(mtcars, "is_integerish", mean), expected)
+ expect_identical(mutate_if(mtcars, ~is_integerish(.x), mean), expected)
+})
+
+test_that("transmute verbs do not retain original variables", {
+ expect_named(transmute_all(data_frame(x = 1:3, y = 1:3), funs(mean, sd)), c("x_mean", "y_mean", "x_sd", "y_sd"))
+ expect_named(transmute_if(data_frame(x = 1:3, y = 1:3), is_integer, funs(mean, sd)), c("x_mean", "y_mean", "x_sd", "y_sd"))
+ expect_named(transmute_at(data_frame(x = 1:3, y = 1:3), vars(x:y), funs(mean, sd)), c("x_mean", "y_mean", "x_sd", "y_sd"))
+})
+
+test_that("can rename with vars() (#2594)", {
+ expect_equal(mutate_at(tibble(x = 1:3), vars(y = x), mean), tibble(x = 1:3, y = c(2, 2, 2)))
+})
+
+test_that("selection works with grouped data frames (#2624)", {
+ gdf <- group_by(iris, Species)
+ expect_identical(mutate_if(gdf, is.factor, as.character), gdf)
+})
+
+test_that("at selection works even if not all ops are named (#2634)", {
+ df <- tibble(x = 1, y = 2)
+ expect_identical(mutate_at(df, vars(z = x, y), funs(. + 1)), tibble(x = 1, y = 3, z = 2))
+})
+
+test_that("can use a purrr-style lambda", {
+ expect_identical(summarise_at(mtcars, vars(1:2), ~mean(.x)), summarise(mtcars, mpg = mean(mpg), cyl = mean(cyl)))
+})
+
+
+# Deprecated ---------------------------------------------------------
+
+test_that("_each() and _all() families agree", {
+ df <- data.frame(x = 1:3, y = 1:3)
+
+ expect_equal(summarise_each(df, funs(mean)), summarise_all(df, mean))
+ expect_equal(summarise_each(df, funs(mean), x:y), summarise_at(df, vars(x:y), mean))
+ expect_equal(summarise_each(df, funs(mean), z = y), summarise_at(df, vars(z = y), mean))
+
+ expect_equal(mutate_each(df, funs(mean)), mutate_all(df, mean))
+ expect_equal(mutate_each(df, funs(mean), x:y), mutate_at(df, vars(x:y), mean))
+ expect_equal(mutate_each(df, funs(mean), z = y), mutate_at(df, vars(z = y), mean))
+})
+
+test_that("specific directions are given for _all() and _at() versions", {
+ summarise_each(mtcars, funs(mean))
+ summarise_each(mtcars, funs(mean), cyl)
+ mutate_each(mtcars, funs(mean))
+ mutate_each(mtcars, funs(mean), cyl)
+})
diff --git a/tests/testthat/test-colwise-select.R b/tests/testthat/test-colwise-select.R
new file mode 100644
index 0000000..c72a27a
--- /dev/null
+++ b/tests/testthat/test-colwise-select.R
@@ -0,0 +1,73 @@
+context("colwise select")
+
+df <- data_frame(x = 0L, y = 0.5, z = 1)
+
+test_that("can select/rename all variables", {
+ expect_identical(select_all(df), df)
+ expect_error(
+ rename_all(df),
+ "`.funs` must specify a renaming function",
+ fixed = TRUE
+ )
+
+ expect_identical(select_all(df, toupper), set_names(df, c("X", "Y", "Z")))
+ expect_identical(select_all(df, toupper), rename_all(df, toupper))
+})
+
+test_that("can select/rename with predicate", {
+ expect_identical(select_if(df, is_integerish), select(df, x, z))
+ expect_error(
+ rename_if(df, is_integerish),
+ "`.funs` must specify a renaming function",
+ fixed = TRUE
+ )
+
+ expect_identical(select_if(df, is_integerish, toupper), set_names(df[c("x", "z")], c("X", "Z")))
+ expect_identical(rename_if(df, is_integerish, toupper), set_names(df, c("X", "y", "Z")))
+})
+
+test_that("can supply funs()", {
+ expect_identical(select_if(df, funs(is_integerish(.)), funs(toupper(.))), set_names(df[c("x", "z")], c("X", "Z")))
+ expect_identical(rename_if(df, funs(is_integerish(.)), funs(toupper(.))), set_names(df, c("X", "y", "Z")))
+})
+
+test_that("fails when more than one renaming function is supplied", {
+ expect_error(
+ select_all(df, funs(tolower, toupper)),
+ "`.funs` must contain one renaming function, not 2",
+ fixed = TRUE
+ )
+ expect_error(
+ rename_all(df, funs(tolower, toupper)),
+ "`.funs` must contain one renaming function, not 2",
+ fixed = TRUE
+ )
+})
+
+test_that("can select/rename with vars()", {
+ expect_identical(select_at(df, vars(x:y)), df[-3])
+ expect_error(
+ rename_at(df, vars(x:y)),
+ "`.funs` must specify a renaming function",
+ fixed = TRUE
+ )
+
+ expect_identical(select_at(df, vars(x:y), toupper), set_names(df[-3], c("X", "Y")))
+ expect_identical(rename_at(df, vars(x:y), toupper), set_names(df, c("X", "Y", "z")))
+})
+
+test_that("select_if keeps grouping cols", {
+ expect_silent(df <- iris %>% group_by(Species) %>% select_if(is.numeric))
+ expect_equal(df, tbl_df(iris[c(5, 1:4)]))
+})
+
+test_that("select_if() handles non-syntactic colnames", {
+ df <- data_frame(`x 1` = 1:3)
+ expect_identical(select_if(df, is_integer)[[1]], 1:3)
+})
+
+test_that("select_if() handles quoted predicates", {
+ expected <- select_if(mtcars, is_integerish)
+ expect_identical(select_if(mtcars, "is_integerish"), expected)
+ expect_identical(select_if(mtcars, ~is_integerish(.x)), expected)
+})
diff --git a/tests/testthat/test-colwise.R b/tests/testthat/test-colwise.R
deleted file mode 100644
index 581c741..0000000
--- a/tests/testthat/test-colwise.R
+++ /dev/null
@@ -1,86 +0,0 @@
-context("colwise")
-
-test_that("funs found in current environment", {
- f <- function(x) 1
- df <- data.frame(x = c(2:10, 1000))
-
- out <- summarise_all(df, funs(f, mean, median))
- expect_equal(out, data.frame(f = 1, mean = 105.4, median = 6.5))
-})
-
-test_that("can use character vectors", {
- df <- data.frame(x = 1:3)
-
- expect_equal(summarise_all(df, "mean"), summarise_all(df, funs(mean)))
- expect_equal(mutate_all(df, c(mean = "mean")), mutate_all(df, funs(mean = mean)))
-})
-
-test_that("can use bare functions", {
- df <- data.frame(x = 1:3)
-
- expect_equal(summarise_all(df, mean), summarise_all(df, funs(mean)))
- expect_equal(mutate_all(df, mean), mutate_all(df, funs(mean)))
-})
-
-test_that("default names are smallest unique set", {
- df <- data.frame(x = 1:3, y = 1:3)
-
- expect_named(summarise_at(df, vars(x:y), funs(mean)), c("x", "y"))
- expect_named(summarise_at(df, vars(x), funs(mean, sd)), c("mean", "sd"))
- expect_named(summarise_at(df, vars(x:y), funs(mean, sd)), c("x_mean", "y_mean", "x_sd", "y_sd"))
-})
-
-test_that("named arguments force complete namd", {
- df <- data.frame(x = 1:3, y = 1:3)
- expect_named(summarise_at(df, vars(x:y), funs(mean = mean)), c("x_mean", "y_mean"))
- expect_named(summarise_at(df, vars(x = x), funs(mean, sd)), c("x_mean", "x_sd"))
-})
-
-test_that("additional arguments are merged into funs calls", {
- multiple <- funs_(c("mean", "median"), list(na.rm = TRUE))
- expect_true(multiple[[1]]$expr$na.rm)
- expect_true(multiple[[2]]$expr$na.rm)
-
- overwritten <- funs_(quote(mean(trim = 10)), list(trim = 1))
- expect_equal(overwritten[[1]]$expr$trim, 1)
-})
-
-expect_classes <- function(tbl, expected) {
- classes <- unname(vapply(tbl, class, character(1)))
- classes <- paste0(substring(classes, 0, 1), collapse = "")
- expect_equal(classes, expected)
-}
-
-test_that("can select colwise", {
- columns <- iris %>% mutate_at(vars(starts_with("Petal")), as.character)
- expect_classes(columns, "nnccf")
-
- numeric <- iris %>% mutate_at(c(1, 3), as.character)
- expect_classes(numeric, "cncnf")
-
- character <- iris %>% mutate_at("Species", as.character)
- expect_classes(character, "nnnnc")
-})
-
-test_that("can probe colwise", {
- predicate <- iris %>% mutate_if(is.factor, as.character)
- expect_classes(predicate, "nnnnc")
-
- logical <- iris %>% mutate_if(c(TRUE, FALSE, TRUE, TRUE, FALSE), as.character)
- expect_classes(logical, "cnccf")
-})
-
-test_that("sql sources fail with bare functions", {
- expect_error(memdb_frame(x = 1) %>% mutate_all(mean) %>% collect())
-})
-
-
-# Deprecated ---------------------------------------------------------
-
-test_that("summarise_each() and summarise_all() agree", {
- df <- data.frame(x = 1:3, y = 1:3)
-
- expect_equal(summarise_each(df, funs(mean)), summarise_all(df, mean))
- expect_equal(summarise_each(df, funs(mean), x:y), summarise_at(df, vars(x:y), mean))
- expect_equal(summarise_each(df, funs(mean), z = y), summarise_at(df, vars(z = y), mean))
-})
diff --git a/tests/testthat/test-combine.R b/tests/testthat/test-combine.R
index e9b17c7..5ad3aed 100644
--- a/tests/testthat/test-combine.R
+++ b/tests/testthat/test-combine.R
@@ -1,18 +1,217 @@
context("combine")
test_that("combine handles NULL (1596)", {
- expect_equal( combine(list(NULL, 1,2)), c(1,2) )
- expect_equal( combine(list(1,NULL,2)), c(1,2) )
- expect_equal( combine(list(1,2,NULL)), c(1,2) )
- expect_error( combine(list(NULL,NULL)))
+ expect_equal(combine(list(NULL, 1, 2)), c(1, 2))
+ expect_equal(combine(list(1, NULL, 2)), c(1, 2))
+ expect_equal(combine(list(1, 2, NULL)), c(1, 2))
+ expect_error(
+ combine(list(NULL, NULL)),
+ "no data to combine, all elements are NULL",
+ fixed = TRUE
+ )
})
test_that("combine complains about incompatibilites", {
- expect_error(combine("a", 1), "character to numeric")
- expect_error(combine(factor("a"), 1L), "factor to integer")
+ expect_error(
+ combine("a", 1),
+ "Argument 2 can't be converted from numeric to character"
+ )
+ expect_error(
+ combine(factor("a"), 1L),
+ "Argument 2 can't be converted from integer to factor"
+ )
})
test_that("combine works with input that used to fail (#1780)", {
no <- list(alpha = letters[1:3], omega = letters[24:26])
expect_equal(combine(no), unlist(no, use.names = FALSE))
})
+
+test_that("combine works with NA and logical (#2203)", {
+ # NA first
+ expected_result <- c(NA, TRUE, FALSE, NA, TRUE)
+ works1 <- combine(list(NA, TRUE, FALSE, NA, TRUE))
+ expect_equal(works1, expected_result)
+
+ # NA length == 1
+ expected_result <- c(TRUE, FALSE, NA, TRUE)
+ works1 <- combine(list(TRUE, FALSE, NA, TRUE))
+ expect_equal(works1, expected_result)
+
+ # NA length > 1
+ expected_result <- c(TRUE, FALSE, NA, NA, TRUE)
+ works3 <- combine(list(TRUE, FALSE, c(NA, NA), TRUE))
+ expect_equal(works3, expected_result)
+
+})
+
+test_that("combine works with NA and integers (#2203)", {
+ works <- combine(list(1L, 2L, NA, 4L))
+ expect_equal(works, c(1L, 2L, NA, 4L))
+ works <- combine(list(1L, 2L, c(NA, NA), 4L))
+ expect_equal(works, c(1L, 2L, NA, NA, 4L))
+})
+
+test_that("combine works with NA and factors (#2203)", {
+ # NA first
+ fac <- factor(c("a", "c", NA, "b"), levels = letters[1:3])
+ expected_result <- fac[c(3, 1, 3, 2)]
+ works1 <- combine(list(NA, fac[1], NA, fac[2]))
+ expect_equal(works1, expected_result)
+
+ # NA length == 1
+ expected_result <- fac
+ works1 <- combine(list(fac[1], fac[2], fac[3], fac[4]))
+ expect_equal(works1, expected_result)
+
+ works2 <- combine(list(fac[1], fac[2], NA, fac[4]))
+ expect_equal(works2, expected_result)
+
+ # NA length > 1
+ expected_result <- fac[c(1, 2, 3, 3, 4)]
+
+ works3 <- combine(list(fac[1], fac[2], fac[c(3, 3)], fac[4]))
+ expect_equal(works3, expected_result)
+
+ works4 <- combine(list(fac[1], fac[2], c(NA, NA), fac[4]))
+ expect_equal(works4, expected_result)
+})
+
+test_that("combine works with NA and double (#2203)", {
+ # NA first
+ works <- combine(list(NA, 1.5, 2.5, NA, 4.5))
+ expect_equal(works, c(NA, 1.5, 2.5, NA, 4.5))
+ # NA length 1
+ works <- combine(list(1.5, 2.5, NA, 4.5))
+ expect_equal(works, c(1.5, 2.5, NA, 4.5))
+ # NA length > 1
+ works <- combine(list(1.5, 2.5, c(NA, NA), 4.5))
+ expect_equal(works, c(1.5, 2.5, NA, NA, 4.5))
+})
+
+test_that("combine works with NA and characters (#2203)", {
+ # NA first
+ works <- combine(list(NA, "a", "b", "c", NA, "e"))
+ expect_equal(works, c(NA, "a", "b", "c", NA, "e"))
+ # NA length 1
+ works <- combine(list("a", "b", "c", NA, "e"))
+ expect_equal(works, c("a", "b", "c", NA, "e"))
+ # NA length > 1
+ works <- combine(list("a", "b", "c", c(NA, NA), "e"))
+ expect_equal(works, c("a", "b", "c", NA, NA, "e"))
+})
+
+
+test_that("combine works with NA and POSIXct (#2203)", {
+ # NA first
+ works <- combine(list(NA, as.POSIXct("2010-01-01"), as.POSIXct("2010-01-02"),
+ NA, as.POSIXct("2010-01-04")))
+ expect_equal(works, c(as.POSIXct(c(NA, "2010-01-01", "2010-01-02",
+ NA, "2010-01-04"))))
+ # NA length 1
+ works <- combine(list(as.POSIXct("2010-01-01"), as.POSIXct("2010-01-02"),
+ NA, as.POSIXct("2010-01-04")))
+ expect_equal(works, c(as.POSIXct(c("2010-01-01", "2010-01-02",
+ NA, "2010-01-04"))))
+ # NA length > 1
+ works <- combine(list(as.POSIXct("2010-01-01"), as.POSIXct("2010-01-02"),
+ c(NA, NA), as.POSIXct("2010-01-04")))
+ expect_equal(works, c(as.POSIXct(c("2010-01-01", "2010-01-02",
+ NA, NA, "2010-01-04"))))
+})
+
+test_that("combine works with NA and Date (#2203)", {
+ # NA first
+ expected_result <- as.Date("2010-01-01") + c(NA, 1, 2, NA, 4)
+ expect_equal(combine(as.list(expected_result)), expected_result)
+
+ # NA length == 1
+ expected_result <- c(as.Date(c("2010-01-01", "2010-01-02", NA, "2010-01-04")))
+ works1 <- combine(list(as.Date("2010-01-01"), as.Date("2010-01-02"),
+ as.Date(NA), as.Date("2010-01-04")))
+ expect_equal(works1, expected_result)
+
+ works2 <- combine(list(as.Date("2010-01-01"), as.Date("2010-01-02"),
+ NA, as.Date("2010-01-04")))
+ expect_equal(works2, expected_result)
+
+ # NA length > 1
+ expected_result <- as.Date("2010-01-01") + c(0, 1, NA, NA, 3)
+ works1 <- combine(split(expected_result, c(1, 2, 3, 3, 4)))
+ expect_equal(works1, expected_result)
+
+ works2 <- combine(list(as.Date("2010-01-01"), as.Date("2010-01-02"),
+ c(NA, NA),
+ as.Date("2010-01-04")))
+ expect_equal(works2, expected_result)
+})
+
+
+test_that("combine works with NA and complex (#2203)", {
+ # NA first
+ expected_result <- c(NA, 1 + 2i)
+ works1 <- combine(list(NA, 1 + 2i))
+ expect_equal(works1, expected_result)
+
+ # NA length == 1
+ expected_result <- c(1, 2, NA, 4) + 1i
+
+ expect_equal(combine(as.list(expected_result)), expected_result)
+
+ works2 <- combine(list(1 + 1i, 2 + 1i, NA, 4 + 1i))
+ expect_equal(works2, expected_result)
+
+ # NA length > 1
+ expected_result <- c(1, 2, NA, NA, 4) + 1i
+ expect_equal(combine(split(expected_result, c(1, 2, 3, 3, 4))),
+ expected_result)
+
+ works3 <- combine(list(1 + 1i, 2 + 1i, c(NA, NA), 4 + 1i))
+ expect_equal(works3, expected_result)
+})
+
+test_that("combine works with integer64 (#1092)", {
+ expect_equal(
+ combine(bit64::as.integer64(2^34), bit64::as.integer64(2^35)),
+ bit64::as.integer64(c(2^34, 2^35))
+ )
+})
+
+test_that("combine works with difftime", {
+ expect_equal(
+ combine(as.difftime(1, units = "mins"), as.difftime(1, units = "hours")),
+ as.difftime(c(60, 3600), units = "secs")
+ )
+ expect_equal(
+ combine(as.difftime(1, units = "secs"), as.difftime(1, units = "secs")),
+ as.difftime(c(1, 1), units = "secs")
+ )
+ expect_equal(
+ combine(as.difftime(1, units = "days"), as.difftime(1, units = "secs")),
+ as.difftime(c(24*60*60, 1), units = "secs")
+ )
+ expect_equal(
+ combine(as.difftime(2, units = "weeks"), as.difftime(1, units = "secs")),
+ as.difftime(c(2*7*24*60*60, 1), units = "secs")
+ )
+ expect_equal(
+ combine(as.difftime(2, units = "weeks"), as.difftime(3, units = "weeks")),
+ as.difftime(c(2,3), units = "weeks")
+ )
+
+})
+
+test_that("combine works with hms and difftime", {
+ expect_equal(
+ combine(as.difftime(2, units = "weeks"), hms::hms(hours = 1)),
+ as.difftime(c(2*7*24*60*60, 3600), units = "secs")
+ )
+ expect_equal(
+ combine(hms::hms(hours = 1), as.difftime(2, units = "weeks")),
+ hms::hms(seconds = c(3600, 2*7*24*60*60))
+ )
+
+})
+
+# Uses helper-combine.R
+combine_coercion_types()
diff --git a/tests/testthat/test-compute.R b/tests/testthat/test-compute.R
deleted file mode 100644
index 128db7c..0000000
--- a/tests/testthat/test-compute.R
+++ /dev/null
@@ -1,34 +0,0 @@
-context("Compute")
-
-test_that("compute doesn't change representation", {
- tbls <- test_frame(x = 5:1, y = 1:5, z = "a")
-
- compare_tbls(tbls, . %>% compute, convert = TRUE)
- compare_tbls(tbls, . %>% mutate(a = x) %>% compute, convert = TRUE)
-})
-
-test_that("compute can create indexes", {
- tbls <- test_frame(x = 5:1, y = 1:5, z = "a")
-
- compare_tbls(tbls, . %>% compute(indexes = c("x", "y")),
- compare = equal_data_frame, convert = TRUE)
- compare_tbls(tbls, . %>% compute(indexes = list("x", "y", c("x", "y"))),
- compare = equal_data_frame, convert = TRUE)
- compare_tbls(tbls, . %>% compute(indexes = "x", unique_indexes = "y"),
- compare = equal_data_frame, convert = TRUE)
- compare_tbls(tbls, . %>% compute(unique_indexes = c("x", "y")),
- compare = equal_data_frame, convert = TRUE)
- compare_tbls(tbls, . %>% compute(unique_indexes = list(c("x", "z"), c("y", "z"))),
- compare = equal_data_frame, convert = TRUE)
- # FIXME: Reenable Postgres when it starts throwing errors on execution
- # failures
- compare_tbls(
- tbls[!(names(tbls) %in% c("df", "postgres"))],
- function(tbl) {
- expect_error(compute(tbl, unique_indexes = "z"), ".")
- expect_error(compute(tbl, indexes = "x", unique_indexes = "z"))
- data_frame()
- },
- ref = data_frame()
- )
-})
diff --git a/tests/testthat/test-copy_to.R b/tests/testthat/test-copy_to.R
new file mode 100644
index 0000000..1f24b84
--- /dev/null
+++ b/tests/testthat/test-copy_to.R
@@ -0,0 +1,66 @@
+context("copy_to")
+
+test_that("src_local only overwrites if overwrite = TRUE", {
+ env <- new.env(parent = emptyenv())
+ env$x <- 1
+
+ src_env <- src_df(env = env)
+
+ expect_error(
+ copy_to(src_env, tibble(x = 1), name = "x"),
+ "object with `name` = `x` must not already exist, unless `overwrite` = TRUE",
+ fixed = TRUE
+ )
+
+ df <- tibble(x = 1)
+ copy_to(src_env, df, name = "x", overwrite = TRUE)
+ expect_equal(env$x, df)
+})
+
+test_that("src_local errs with pkg/env", {
+ expect_error(
+ src_df("base", new.env()),
+ "Exactly one of `pkg` and `env` must be non-NULL, not 2",
+ fixed = TRUE
+ )
+
+ expect_error(
+ src_df(),
+ "Exactly one of `pkg` and `env` must be non-NULL, not 0",
+ fixed = TRUE
+ )
+})
+
+test_that("auto_copy() requires same source", {
+ skip_if_not_installed("dbplyr")
+
+ env <- new.env(parent = emptyenv())
+ env$iris <- iris
+ src_iris <- src_df(env = env)
+
+ src_mtcars <- src_sqlite(":memory:", create = TRUE)
+ copy_to(src_mtcars, mtcars, "mtcars")
+
+ expect_error(
+ auto_copy(tbl(src_iris, "iris"), src_mtcars, name = "iris"),
+ "`x` and `y` must share the same src, set `copy` = TRUE (may be slow)",
+ fixed = TRUE
+ )
+
+ expect_error(
+ auto_copy(tbl(src_mtcars, "mtcars"), src_iris, name = "mtcars"),
+ "`x` and `y` must share the same src, set `copy` = TRUE (may be slow)",
+ fixed = TRUE
+ )
+})
+
+test_that("src_sqlite() errs if path does not exist", {
+ skip_if_not_installed("dbplyr")
+
+ expect_error(
+ src_sqlite(":memory:"),
+ "`path` must not already exist, unless `create` = TRUE",
+ fixed = TRUE
+ )
+})
+
diff --git a/tests/testthat/test-count-tally.r b/tests/testthat/test-count-tally.r
new file mode 100644
index 0000000..022eab2
--- /dev/null
+++ b/tests/testthat/test-count-tally.r
@@ -0,0 +1,107 @@
+context("count-tally")
+
+# count -------------------------------------------------------------------
+
+test_that("can count variable called n", {
+ df <- data.frame(n = c(1, 1, 2, 2, 2))
+
+ out <- df %>% count(n)
+ expect_equal(names(out), c("n", "nn"))
+ expect_equal(out$nn, c(2, 3))
+
+ out <- df %>% count(n, sort = TRUE)
+ expect_equal(out$nn, c(3, 2))
+})
+
+test_that("count preserves grouping of input", {
+ df <- data.frame(g = c(1, 2, 2, 2))
+
+ out1 <- count(df, g)
+ expect_equal(group_vars(out1), character())
+
+ df2 <- df %>% group_by(g)
+ out2 <- count(df2)
+ expect_equal(group_vars(out2), "g")
+})
+
+test_that("grouped count includes group", {
+ df <- data.frame(g = c(1, 2, 2, 2))
+
+ res <- df %>% group_by(g) %>% count()
+ expect_equal(names(res), c("g", "n"))
+ expect_equal(res$n, c(1, 3))
+ expect_equal(group_vars(res), "g")
+})
+
+
+# add_count ---------------------------------------------------------------
+
+test_that("can add counts of a variable called n", {
+ df <- data.frame(n = c(1, 1, 2, 2, 2))
+
+ out <- df %>% add_count(n)
+ expect_equal(names(out), c("n", "nn"))
+ expect_equal(out$n, df$n)
+ expect_equal(out$nn, c(2, 2, 3, 3, 3))
+
+ out <- df %>% add_count(n, sort = TRUE)
+ expect_equal(out$nn, c(3, 3, 3, 2, 2))
+})
+
+test_that("add_count respects and preserves existing groups", {
+ df <- data.frame(g = c(1, 2, 2, 2), val = c("b", "b", "b", "c"))
+ res <- df %>% add_count(val)
+ expect_equal(res$n, c(3, 3, 3, 1))
+ expect_no_groups(res)
+
+ res <- df %>% group_by(g) %>% add_count(val)
+ expect_equal(res$n, c(1, 2, 2, 1))
+ expect_groups(res, "g")
+})
+
+
+# tally -------------------------------------------------------------------
+
+test_that("weighted tally drops NAs (#1145)", {
+ df <- data_frame(x = c(1, 1, NA))
+
+ expect_equal(tally(df, x)$n, 2)
+})
+
+
+# add_tally ---------------------------------------------------------------
+
+
+test_that("can add tallies of a variable", {
+ df <- data.frame(a = c(1, 1, 2, 2, 2))
+
+ out <- df %>% group_by(a) %>% add_tally()
+ expect_equal(names(out), c("a", "n"))
+ expect_equal(out$a, df$a)
+ expect_equal(out$n, c(2, 2, 3, 3, 3))
+
+ out <- df %>% group_by(a) %>% add_tally(sort = TRUE)
+ expect_equal(out$n, c(3, 3, 3, 2, 2))
+})
+
+test_that("add_tally respects and preserves existing groups", {
+ df <- data.frame(g = c(1, 2, 2, 2), val = c("b", "b", "b", "c"))
+ res <- df %>% group_by(val) %>% add_tally()
+ expect_equal(res$n, c(3, 3, 3, 1))
+ expect_groups(res, "val")
+
+ res <- df %>% group_by(g, val) %>% add_tally()
+ expect_equal(res$n, c(1, 2, 2, 1))
+ expect_groups(res, c("g", "val"))
+})
+
+test_that("add_tally can be given a weighting variable", {
+ df <- data.frame(a = c(1, 1, 2, 2, 2), w = c(1, 1, 2, 3, 4))
+
+ out <- df %>% group_by(a) %>% add_tally(wt = w)
+ expect_equal(out$n, c(2, 2, 9, 9, 9))
+
+ out <- df %>% group_by(a) %>% add_tally(wt = w + 1)
+ expect_equal(out$n, c(4, 4, 12, 12, 12))
+})
+
diff --git a/tests/testthat/test-data_frame.R b/tests/testthat/test-data_frame.R
index 0104263..37b17b0 100644
--- a/tests/testthat/test-data_frame.R
+++ b/tests/testthat/test-data_frame.R
@@ -4,8 +4,8 @@ context("data_frame")
test_that("add_rownames keeps the tbl classes (#882)", {
expect_warning(
- res <- mtcars %>% add_rownames( "Make&Model" ),
+ res <- mtcars %>% add_rownames("Make&Model"),
"Deprecated"
)
- expect_equal( class(res), c("tbl_df","tbl", "data.frame"))
+ expect_equal(class(res), c("tbl_df", "tbl", "data.frame"))
})
diff --git a/tests/testthat/test-distinct.R b/tests/testthat/test-distinct.R
index 4357e46..f4dc3ca 100644
--- a/tests/testthat/test-distinct.R
+++ b/tests/testthat/test-distinct.R
@@ -1,31 +1,30 @@
context("Distinct")
-df <- data.frame(
- x = c(1, 1, 1, 1),
- y = c(1, 1, 2, 2),
- z = c(1, 2, 1, 2)
-)
-tbls <- test_load(df)
-
test_that("distinct equivalent to local unique when keep_all is TRUE", {
- compare_tbls(tbls, function(x) x %>% distinct(), ref = unique(df))
+ df <- tibble(
+ x = c(1, 1, 1, 1),
+ y = c(1, 1, 2, 2),
+ z = c(1, 2, 1, 2)
+ )
+
+ expect_equal(distinct(df), unique(df))
})
test_that("distinct for single column works as expected (#1937)", {
- compare_tbls(tbls, function(x) x %>% distinct(x, .keep_all = FALSE), ref = df[1, "x", drop= FALSE])
- compare_tbls(tbls, function(x) x %>% distinct(y, .keep_all = FALSE), ref = df[c(1, 3), "y", drop= FALSE])
-})
+ df <- tibble(
+ x = c(1, 1, 1, 1),
+ y = c(1, 1, 2, 2),
+ z = c(1, 2, 1, 2)
+ )
-test_that("distinct throws error if column is specified and .keep_all is TRUE", {
- skip_if_no_sqlite()
- expect_error(collect(distinct(tbls$sqlite, x, .keep_all = TRUE)),
- "specified columns.*[.]keep_all")
+ expect_equal(distinct(df, x, .keep_all = FALSE), unique(df["x"]))
+ expect_equal(distinct(df, y, .keep_all = FALSE), unique(df["y"]))
})
test_that("distinct works for 0-sized columns (#1437)", {
df <- data_frame(x = 1:10) %>% select(-x)
- ddf <- distinct(df, x)
- expect_equal( ncol(ddf), 0L )
+ ddf <- distinct(df)
+ expect_equal(ncol(ddf), 0L)
})
test_that("if no variables specified, uses all", {
@@ -33,11 +32,9 @@ test_that("if no variables specified, uses all", {
expect_equal(distinct(df), data_frame(x = 1, y = 2))
})
-test_that("by default distinct keeps only specified cols", {
- df <- data_frame(x = c(1, 1, 1))
-
+test_that("distinct keeps only specified cols", {
+ df <- data_frame(x = c(1, 1, 1), y = c(1, 1, 1))
expect_equal(df %>% distinct(x), data_frame(x = 1))
- expect_equal(df %>% group_by(x) %>% distinct(), data_frame(x = 1))
})
test_that("unless .keep_all = TRUE", {
@@ -46,3 +43,36 @@ test_that("unless .keep_all = TRUE", {
expect_equal(df %>% distinct(x), data_frame(x = 1))
expect_equal(df %>% distinct(x, .keep_all = TRUE), data_frame(x = 1, y = 3L))
})
+
+test_that("distinct doesn't duplicate columns", {
+ df <- tibble(a = 1:3, b = 4:6)
+
+ expect_named(df %>% distinct(a, a), "a")
+ expect_named(df %>% group_by(a) %>% distinct(a), "a")
+})
+
+
+test_that("grouped distinct always includes group cols", {
+ df <- tibble(g = c(1, 2), x = c(1, 2))
+
+ out <- df %>% group_by(g) %>% distinct(x)
+ expect_equal(df, out)
+})
+
+test_that("empty grouped distinct equivalent to empty ungrouped", {
+ df <- tibble(g = c(1, 2), x = c(1, 2))
+
+ df1 <- df %>% distinct() %>% group_by(g)
+ df2 <- df %>% group_by(g) %>% distinct()
+
+ expect_equal(df1, df2)
+})
+
+test_that("distinct on a new, mutated variable is equivalent to mutate followed by distinct", {
+ df <- tibble(g = c(1, 2), x = c(1, 2))
+
+ df1 <- df %>% distinct(aa = g * 2)
+ df2 <- df %>% mutate(aa = g * 2) %>% distinct(aa)
+
+ expect_equal(df1, df2)
+})
diff --git a/tests/testthat/test-do.R b/tests/testthat/test-do.R
index 5baf431..cb465ac 100644
--- a/tests/testthat/test-do.R
+++ b/tests/testthat/test-do.R
@@ -6,22 +6,27 @@ df <- data.frame(
g = c(1, 2, 2, 3, 3, 3),
x = 1:6,
y = 6:1
-)
-
-tbls <- test_load(df)
-grp <- lapply(tbls, function(x) x %>% group_by(g))
+) %>% group_by(g)
test_that("can't use both named and unnamed args", {
- expect_error(grp$df %>% do(x = 1, 2), "must either be all named or all unnamed")
+ expect_error(
+ df %>% do(x = 1, 2),
+ "Arguments must either be all named or all unnamed",
+ fixed = TRUE
+ )
})
test_that("unnamed elements must return data frames", {
- expect_error(grp$df %>% do(1), "not data frames")
- expect_error(grp$df %>% do("a"), "not data frames")
+ expect_error(
+ df %>% ungroup %>% do(1), "Result must be a data frame, not numeric")
+ expect_error(
+ df %>% do(1), "Results 1, 2, 3 must be data frames, not numeric")
+ expect_error(
+ df %>% do("a"), "Results 1, 2, 3 must be data frames, not character")
})
test_that("unnamed results bound together by row", {
- first <- grp$df %>% do(head(., 1))
+ first <- df %>% do(head(., 1))
expect_equal(nrow(first), 3)
expect_equal(first$g, 1:3)
@@ -29,18 +34,21 @@ test_that("unnamed results bound together by row", {
})
test_that("can only use single unnamed argument", {
- expect_error(grp$df %>% do(head, tail), "single unnamed argument")
+ expect_error(
+ df %>% do(head, tail),
+ "Can only supply one unnamed argument, not 2"
+ )
})
test_that("named argument become list columns", {
- out <- grp$df %>% do(nrow = nrow(.), ncol = ncol(.))
+ out <- df %>% do(nrow = nrow(.), ncol = ncol(.))
expect_equal(out$nrow, list(1, 2, 3))
# includes grouping columns
expect_equal(out$ncol, list(3, 3, 3))
})
test_that("colums in output override columns in input", {
- out <- grp$df %>% do(data.frame(g = 1))
+ out <- df %>% do(data.frame(g = 1))
expect_equal(names(out), "g")
expect_equal(out$g, c(1, 1, 1))
})
@@ -49,8 +57,10 @@ test_that("empty results preserved (#597)", {
blankdf <- function(x) data.frame(blank = numeric(0))
dat <- data.frame(a = 1:2, b = factor(1:2))
- dat %>% group_by(b) %>% do(blankdf(.))
-
+ expect_equal(
+ dat %>% group_by(b) %>% do(blankdf(.)),
+ data.frame(b = factor(integer(), levels = 1:2), blank = numeric())
+ )
})
test_that("empty inputs give empty outputs (#597)", {
@@ -140,52 +150,7 @@ test_that("empty data frames give consistent outputs", {
expect_equal(c(g = "chr", y = "list"))
})
-# SQLite -----------------------------------------------------------------------
-
-test_that("named argument become list columns", {
- skip_if_no_sqlite()
-
- out <- grp$sqlite %>% do(nrow = nrow(.), ncol = ncol(.))
- expect_equal(out$nrow, list(1, 2, 3))
- expect_equal(out$ncol, list(3, 3, 3))
-})
-
-test_that("unnamed results bound together by row", {
- skip_if_no_sqlite()
-
- first <- grp$sqlite %>% do(head(., 1))
-
- expect_equal(nrow(first), 3)
- expect_equal(first$g, 1:3)
- expect_equal(first$x, c(1, 2, 4))
-})
-
-test_that("Results respect select", {
- skip_if_no_sqlite()
-
- smaller <- grp$sqlite %>% select(g, x) %>% do(ncol = ncol(.))
- expect_equal(smaller$ncol, list(2, 2, 2))
-})
-
-test_that("grouping column not repeated", {
- skip_if_no_sqlite()
-
- out <- grp$sqlite %>% do(names = names(.))
- expect_equal(out$names[[1]], c("g", "x", "y"))
-})
-
-test_that("results independent of chunk_size", {
- skip_if_no_sqlite()
- nrows <- function(group, n) {
- unlist(do(group, nrow = nrow(.), .chunk_size = n)$nrow)
- }
-
- expect_equal(nrows(grp$sqlite, 1), c(1, 2, 3))
- expect_equal(nrows(grp$sqlite, 2), c(1, 2, 3))
- expect_equal(nrows(grp$sqlite, 10), c(1, 2, 3))
-})
-
-test_that("handling of empty data frames in do",{
+test_that("handling of empty data frames in do", {
blankdf <- function(x) data.frame(blank = numeric(0))
dat <- data.frame(a = 1:2, b = factor(1:2))
res <- dat %>% group_by(b) %>% do(blankdf(.))
diff --git a/tests/testthat/test-equality.r b/tests/testthat/test-equality.r
index 97425e6..8f73e56 100644
--- a/tests/testthat/test-equality.r
+++ b/tests/testthat/test-equality.r
@@ -19,7 +19,7 @@ test_that("data frames equal to themselves", {
})
test_that("data frames equal to random permutations of themselves", {
- scramble <- function(x){
+ scramble <- function(x) {
x[sample(nrow(x)), sample(ncol(x)), drop = FALSE]
}
@@ -35,36 +35,67 @@ test_that("data frames not equal if missing row", {
})
test_that("data frames not equal if missing col", {
- expect_match(all.equal(tbl_df(mtcars), mtcars[, -1]), "Cols in x but not y: mpg")
- expect_match(all.equal(tbl_df(iris), iris[, -1]), "Cols in x but not y: Sepal.Length")
- expect_match(all.equal(tbl_df(df_all), df_all[, -1]), "Cols in x but not y: a")
+ expect_match(
+ all.equal(tbl_df(mtcars), mtcars[, -1]),
+ "Cols in x but not y: `mpg`"
+ )
+ expect_match(
+ all.equal(tbl_df(iris), iris[, -1]),
+ "Cols in x but not y: `Sepal.Length`"
+ )
+ expect_match(
+ all.equal(tbl_df(df_all), df_all[, -1]),
+ "Cols in x but not y: `a`"
+ )
})
test_that("factors equal only if levels equal", {
- df1 <- data.frame(x = factor(c("a", "b")))
- df2 <- data.frame(x = factor(c("a", "d")))
- expect_match(all.equal(tbl_df(df1), tbl_df(df2)), "Factor levels not equal for column x" )
+ df1 <- data_frame(x = factor(c("a", "b")))
+ df2 <- data_frame(x = factor(c("a", "d")))
+ expect_equal(
+ all.equal(df1, df2),
+ "Factor levels not equal for column `x`"
+ )
+ expect_equal(
+ all.equal(df2, df1),
+ "Factor levels not equal for column `x`"
+ )
+})
+
+test_that("factor comparison requires strict equality of levels (#2440)", {
+ df1 <- data_frame(x = factor("a"))
+ df2 <- data_frame(x = factor("a", levels = c("a", "b")))
+ expect_equal(
+ all.equal(df1, df2),
+ "Factor levels not equal for column `x`"
+ )
+ expect_equal(
+ all.equal(df2, df1),
+ "Factor levels not equal for column `x`"
+ )
+ expect_warning(expect_true(all.equal(df1, df2, convert = TRUE)), "joining factors")
+ expect_warning(expect_true(all.equal(df2, df1, convert = TRUE)), "joining factors")
})
test_that("BoolResult does not overwrite singleton R_TrueValue", {
dplyr:::equal_data_frame(mtcars, mtcars)
- expect_equal( class(2 == 2), "logical" )
+ expect_equal(class(2 == 2), "logical")
})
test_that("all.equal.data.frame handles data.frames with NULL names", {
x <- data.frame(LETTERS[1:3], rnorm(3))
names(x) <- NULL
- expect_true(all.equal(x,x))
+ expect_true(all.equal(x, x))
})
-test_that( "data frame equality test with ignore_row_order=TRUE detects difference in number of rows. #1065", {
+test_that("data frame equality test with ignore_row_order=TRUE detects difference in number of rows. #1065", {
DF1 <- data_frame(a = 1:4, b = letters[1:4])
- DF2 <- data_frame(a = c(1:4,4L), b = letters[c(1:4,4L)])
- expect_false( isTRUE(all.equal(DF1, DF2, ignore_row_order=TRUE)))
+ DF2 <- data_frame(a = c(1:4, 4L), b = letters[c(1:4, 4L)])
+ expect_false(isTRUE(all.equal(DF1, DF2, ignore_row_order = TRUE)))
- DF1 <- data_frame(a = c(1:4,2L), b = letters[c(1:4,2L)])
- DF2 <- data_frame(a = c(1:4,4L), b = letters[c(1:4,4L)])
- expect_false(isTRUE(all.equal(DF1, DF2, ignore_row_order=TRUE)))
+ DF1 <- data_frame(a = c(1:4, 2L), b = letters[c(1:4, 2L)])
+ DF2 <- data_frame(a = c(1:4, 4L), b = letters[c(1:4, 4L)])
+ expect_false(isTRUE(all.equal(DF1, DF2, ignore_row_order = TRUE)))
})
@@ -72,26 +103,29 @@ test_that("all.equal handles NA_character_ correctly. #1095", {
d1 <- data_frame(x = c(NA_character_))
expect_true(all.equal(d1, d1))
- d2 <- data_frame( x = c(NA_character_, "foo", "bar" ) )
+ d2 <- data_frame(x = c(NA_character_, "foo", "bar"))
expect_true(all.equal(d2, d2))
})
-test_that( "handle Date columns of different types, integer and numeric (#1204)", {
+test_that("handle Date columns of different types, integer and numeric (#1204)", {
a <- data.frame(date = as.Date("2015-06-07"))
- b <- data.frame(date = structure( as.integer(a$date), class = "Date" ) )
- expect_true( all.equal(a, b) )
+ b <- data.frame(date = structure(as.integer(a$date), class = "Date"))
+ expect_true(all.equal(a, b))
})
test_that("equality test fails when convert is FALSE and types don't match (#1484)", {
df1 <- data_frame(x = "a")
df2 <- data_frame(x = factor("a"))
- expect_equal( all_equal(df1, df2, convert = FALSE), "Incompatible type for column x: x character, y factor" )
- expect_warning( all_equal(df1, df2, convert = TRUE) )
+ expect_equal(
+ all_equal(df1, df2, convert = FALSE),
+ "Incompatible type for column `x`: x character, y factor"
+ )
+ expect_warning(all_equal(df1, df2, convert = TRUE))
})
test_that("equality handles data frames with 0 rows (#1506)", {
- df0 <- data_frame(x = numeric(0), y = character(0) )
+ df0 <- data_frame(x = numeric(0), y = character(0))
expect_equal(df0, df0)
})
@@ -102,7 +136,11 @@ test_that("equality handles data frames with 0 columns (#1506)", {
test_that("equality cannot be checked in presence of raw columns", {
df <- data_frame(a = 1:3, b = as.raw(1:3))
- expect_error(all.equal(df, df), "Unsupported vector type raw")
+ expect_error(
+ all.equal(df, df),
+ "Column `b` is of unsupported type raw",
+ fixed = TRUE
+ )
})
test_that("equality returns a message for convert = TRUE", {
@@ -118,3 +156,53 @@ test_that("numeric and integer can be compared if convert = TRUE", {
expect_match(all.equal(df1, df2), "Incompatible")
expect_true(all.equal(df1, df2, convert = TRUE))
})
+
+test_that("returns vector for more than one difference (#1819)", {
+ expect_equal(
+ all.equal(data_frame(a = 1, b = 2), data_frame(a = 1L, b = 2L)),
+ c(
+ "Incompatible type for column `a`: x numeric, y integer",
+ "Incompatible type for column `b`: x numeric, y integer"
+ )
+ )
+})
+
+test_that("returns UTF-8 column names (#2441)", {
+ df1 <- data_frame("\u5e78" := 1)
+ df2 <- data_frame("\u798f" := 1)
+
+ expect_equal(
+ all.equal(df1, df2),
+ c(
+ "Cols in y but not x: `\u798f`. ",
+ "Cols in x but not y: `\u5e78`. "
+ ),
+ fixed = TRUE
+ )
+})
+
+test_that("proper message formatting for set operations", {
+ expect_error(
+ union(data_frame(a = 1), data_frame(a = "1")),
+ "not compatible: Incompatible type for column `a`: x numeric, y character",
+ fixed = TRUE
+ )
+
+ expect_error(
+ union(data_frame(a = 1, b = 2), data_frame(a = "1", b = "2")),
+ "not compatible: \n- Incompatible type for column `a`: x numeric, y character\n- Incompatible type for column `b`: x numeric, y character",
+ fixed = TRUE
+ )
+})
+
+test_that("ignore column order", {
+ expect_equal(
+ all.equal(data_frame(a = 1, b = 2), data_frame(b = 2, a = 1), ignore_col_order = FALSE),
+ "Same column names, but different order"
+ )
+
+ expect_equal(
+ all.equal(data_frame(a = 1, b = 2), data_frame(a = 1), ignore_col_order = FALSE),
+ "Cols in x but not y: `b`. "
+ )
+})
diff --git a/tests/testthat/test-equiv-manip.r b/tests/testthat/test-equiv-manip.r
deleted file mode 100644
index 7d69ecc..0000000
--- a/tests/testthat/test-equiv-manip.r
+++ /dev/null
@@ -1,26 +0,0 @@
-context("Equivalence (manip)")
-
-
-test_that("mutate happens before summarise", {
- test_f <- function(tbl) {
- res <- tbl %>%
- mutate(x, z = x + y) %>%
- summarise(sum_z = sum(z)) %>%
- collect()
- expect_equal(res$sum_z, 30)
- }
-
- test_frame(x = 5:1, y = 1:5) %>% lapply(test_f)
-})
-
-test_that("select operates on mutated vars", {
- test_f <- function(tbl) {
- res <- tbl %>%
- mutate(x, z = x + y) %>%
- select(z) %>%
- collect()
- expect_equal(res$z, rep(4, 3))
- }
-
- test_frame(x = 1:3, y = 3:1) %>% lapply(test_f)
-})
diff --git a/tests/testthat/test-filter-windowed.R b/tests/testthat/test-filter-windowed.R
deleted file mode 100644
index 5f6ba7c..0000000
--- a/tests/testthat/test-filter-windowed.R
+++ /dev/null
@@ -1,47 +0,0 @@
-context("Filter - windowed")
-
-test_that("filter calls windowed versions of sql functions", {
- test_f <- function(tbl) {
- res <- tbl %>%
- group_by(g) %>%
- filter(row_number(x) < 3) %>%
- collect()
- expect_equal(res$x, c(1, 2, 6, 7))
- expect_equal(res$g, c(1, 1, 2, 2))
- }
-
- df <- data_frame(x = 1:10, g = rep(c(1, 2), each = 5))
- tbls <- test_load(df, ignore = "sqlite") # SQLite doesn't support window functions
- tbls %>% lapply(test_f)
-})
-
-test_that("recycled aggregates generate window function", {
- test_f <- function(tbl) {
- res <- tbl %>%
- group_by(g) %>%
- filter(x > mean(x)) %>%
- collect()
- expect_equal(res$x, c(4, 5, 9, 10))
- expect_equal(res$g, c(1, 1, 2, 2))
- }
-
- df <- data_frame(x = 1:10, g = rep(c(1, 2), each = 5))
- tbls <- test_load(df, ignore = "sqlite") # SQLite doesn't support window functions
- tbls %>% lapply(test_f)
-})
-
-test_that("cumulative aggregates generate window function", {
- test_f <- function(tbl) {
- res <- tbl %>%
- group_by(g) %>%
- arrange(x) %>%
- filter(cumsum(x) > 3) %>%
- collect()
- expect_equal(res$x, c(3, 3, 4))
- expect_equal(res$g, c(1, 2, 2))
- }
-
- df <- data_frame(x = c(1:3, 2:4), g = rep(c(1, 2), each = 3))
- tbls <- test_load(df, ignore = "sqlite") # SQLite doesn't support window functions
- tbls %>% lapply(test_f)
-})
diff --git a/tests/testthat/test-filter.r b/tests/testthat/test-filter.r
index 8d65d81..e5748b2 100644
--- a/tests/testthat/test-filter.r
+++ b/tests/testthat/test-filter.r
@@ -1,89 +1,79 @@
context("Filter")
-df <- expand.grid(a = 1:10, b = letters[1:10],
- KEEP.OUT.ATTRS = FALSE,
- stringsAsFactors = FALSE)
-
-tbls <- test_load(df)
-
-test_that("filter results independent of data tbl (simple)", {
- skip_if_no_sqlite()
-
- expected <- df[df$a > 6, , drop = FALSE]
- compare_tbls(tbls[c("df","sqlite")], function(x) {
- filter_(x, ~ a > 6)
- }, expected)
-})
-
-test_that("filter captures local variables", {
- sel <- c("d", "g", "a")
- expected <- df[df$b %in% sel, , drop = FALSE]
-
- compare_tbls(tbls, function(x) x %>% filter(b %in% sel), ref = expected)
-})
-
-test_that("two filters equivalent to one", {
- expected <- filter(df, a > 4 & b == "a")
-
- compare_tbls(tbls, function(x) x %>% filter(a > 4) %>% filter(b == "a"),
- ref = expected)
-})
-
test_that("filter fails if inputs incorrect length (#156)", {
- expect_error( filter(tbl_df(mtcars), c(F, T)) )
- expect_error( filter(group_by(mtcars, am), c(F, T)) )
+ expect_error(
+ filter(tbl_df(mtcars), c(F, T)),
+ "Result must have length 32, not 2",
+ fixed = TRUE
+ )
+ expect_error(
+ filter(group_by(mtcars, am), c(F, T)),
+ "Result must have length 19, not 2",
+ fixed = TRUE
+ )
})
test_that("filter gives useful error message when given incorrect input", {
- expect_error( filter(tbl_df(mtcars), x ), "unknown column" )
+ # error message by rlang
+ expect_error(filter(tbl_df(mtcars), `_x`),
+ "_x",
+ fixed = TRUE
+ )
})
test_that("filter complains in inputs are named", {
- expect_error(filter(mtcars, x = 1), "takes unnamed arguments")
- expect_error(filter(mtcars, x = 1 & y > 2), "takes unnamed arguments")
+ expect_error(
+ filter(mtcars, x = 1),
+ "`x` (`x = 1`) must not be named, do you need `==`?",
+ fixed = TRUE
+ )
+ expect_error(
+ filter(mtcars, x = 1 & y > 2),
+ "`x` (`x = 1 & y > 2`) must not be named, do you need `==`?",
+ fixed = TRUE
+ )
})
test_that("filter handles passing ...", {
- df <- data.frame( x = 1:4 )
+ df <- data.frame(x = 1:4)
- f <- function(...){
+ f <- function(...) {
x1 <- 4
f1 <- function(y) y
filter(df, ..., f1(x1) > x)
}
- g <- function(...){
+ g <- function(...) {
x2 <- 2
f(x > x2, ...)
}
res <- g()
- expect_equal( res$x, 3L )
+ expect_equal(res$x, 3L)
- df <- group_by(df,x)
+ df <- group_by(df, x)
res <- g()
- expect_equal( res$x, 3L )
-
+ expect_equal(res$x, 3L)
})
-test_that( "filter handles simple symbols", {
- df <- data.frame( x = 1:4, test = rep(c(T,F), each = 2) )
+test_that("filter handles simple symbols", {
+ df <- data.frame(x = 1:4, test = rep(c(T, F), each = 2))
res <- filter(df, test)
- gdf <- group_by(df,x)
+ gdf <- group_by(df, x)
res <- filter(gdf, test)
- h <- function(data){
- test2 <- c(T,T,F,F)
- filter(data,test2)
+ h <- function(data) {
+ test2 <- c(T, T, F, F)
+ filter(data, test2)
}
- expect_equal(h(df), df[1:2,])
+ expect_equal(h(df), df[1:2, ])
- f <- function(data, ...){
+ f <- function(data, ...) {
one <- 1
- filter( data, test, x > one, ...)
+ filter(data, test, x > one, ...)
}
- g <- function(data, ...){
+ g <- function(data, ...) {
four <- 4
- f( data, x < four, ...)
+ f(data, x < four, ...)
}
res <- g(df)
expect_equal(res$x, 2L)
@@ -96,8 +86,8 @@ test_that( "filter handles simple symbols", {
})
test_that("filter handlers scalar results", {
- expect_equivalent( filter(mtcars, min(mpg)>0 ), mtcars )
- expect_equal( filter(group_by(mtcars,cyl), min(mpg)>0 ), group_by(mtcars,cyl) )
+ expect_equivalent(filter(mtcars, min(mpg) > 0), mtcars)
+ expect_equal(filter(group_by(mtcars, cyl), min(mpg) > 0), group_by(mtcars, cyl))
})
test_that("filter propagates attributes", {
@@ -108,8 +98,16 @@ test_that("filter propagates attributes", {
})
test_that("filter fails on integer indices", {
- expect_error(filter(mtcars, 1:2))
- expect_error(filter(group_by(mtcars,cyl), 1:2))
+ expect_error(
+ filter(mtcars, 1:2),
+ "Argument 2 filter condition does not evaluate to a logical vector",
+ fixed = TRUE
+ )
+ expect_error(
+ filter(group_by(mtcars, cyl), 1:2),
+ "Argument 2 filter condition does not evaluate to a logical vector",
+ fixed = TRUE
+ )
})
test_that("filter discards NA", {
@@ -121,14 +119,14 @@ test_that("filter discards NA", {
expect_equal(nrow(res), 2L)
})
-test_that("date class remains on filter (#273)",{
+test_that("date class remains on filter (#273)", {
x1 <- x2 <- data.frame(
- date = seq.Date(as.Date('2013-01-01'), by = "1 days", length.out = 2),
+ date = seq.Date(as.Date("2013-01-01"), by = "1 days", length.out = 2),
var = c(5, 8)
)
- x1.filter <- x1 %>% filter(as.Date(date) > as.Date('2013-01-01'))
+ x1.filter <- x1 %>% filter(as.Date(date) > as.Date("2013-01-01"))
x2$date <- x2$date + 1
- x2.filter <- x2 %>% filter(as.Date(date) > as.Date('2013-01-01'))
+ x2.filter <- x2 %>% filter(as.Date(date) > as.Date("2013-01-01"))
expect_equal(class(x1.filter$date), "Date")
expect_equal(class(x2.filter$date), "Date")
@@ -146,13 +144,13 @@ test_that("filter handles $ correctly (#278)", {
expect_equal(res1, res2)
})
-test_that( "filter returns the input data if no parameters are given", {
- expect_equivalent( filter(mtcars), mtcars )
+test_that("filter returns the input data if no parameters are given", {
+ expect_equivalent(filter(mtcars), mtcars)
})
-test_that( "$ does not end call traversing. #502", {
+test_that("$ does not end call traversing. #502", {
# Suppose some analysis options are set much earlier in the script
- analysis_opts <- list(min_outcome = .25)
+ analysis_opts <- list(min_outcome = 0.25)
# Generate some dummy data
d <- expand.grid(Subject = 1:3, TrialNo = 1:2, Time = 1:3) %>% tbl_df %>%
@@ -166,22 +164,26 @@ test_that( "$ does not end call traversing. #502", {
left <- filter(trial_outcomes, MeanOutcome < analysis_opts$min_outcome)
right <- filter(trial_outcomes, analysis_opts$min_outcome > MeanOutcome)
- expect_equal(left,right)
+ expect_equal(left, right)
})
-test_that( "GroupedDataFrame checks consistency of data (#606)", {
+test_that("GroupedDataFrame checks consistency of data (#606)", {
df1 <- data_frame(
g = rep(1:2, each = 5),
x = 1:10
) %>% group_by(g)
attr(df1, "group_sizes") <- c(2, 2)
- expect_error(df1 %>% filter(x == 1), "corrupt 'grouped_df'" )
+ expect_error(
+ df1 %>% filter(x == 1),
+ "`.data` is a corrupt grouped_df, contains 10 rows, and 4 rows in groups",
+ fixed = TRUE
+ )
})
-test_that( "filter uses the white list (#566)", {
- datesDF <- read.csv(stringsAsFactors=FALSE, text="
+test_that("filter uses the white list (#566)", {
+ datesDF <- read.csv(stringsAsFactors = FALSE, text = "
X
2014-03-13 16:08:19
2014-03-13 16:16:23
@@ -190,100 +192,97 @@ X
")
datesDF$X <- as.POSIXlt(datesDF$X)
- expect_error(
- filter(datesDF, X > as.POSIXlt("2014-03-13")),
- "column 'X' has unsupported class|POSIXct, not POSIXlt.*'X'"
- )
+ # error message from tibble
+ expect_error(filter(datesDF, X > as.POSIXlt("2014-03-13")))
})
-test_that( "filter handles complex vectors (#436)", {
- d <- data.frame(x=1:10, y=1:10+2i)
- expect_equal(filter(d, x<4)$y, 1:3+2i)
- expect_equal(filter(d, Re(y)<4)$y, 1:3+2i)
+test_that("filter handles complex vectors (#436)", {
+ d <- data.frame(x = 1:10, y = 1:10 + 2i)
+ expect_equal(filter(d, x < 4)$y, 1:3 + 2i)
+ expect_equal(filter(d, Re(y) < 4)$y, 1:3 + 2i)
})
test_that("%in% works as expected (#126)", {
- df <- data_frame( a = c("a", "b", "ab"), g = c(1,1,2) )
+ df <- data_frame(a = c("a", "b", "ab"), g = c(1, 1, 2))
- res <- df %>% filter( a %in% letters )
+ res <- df %>% filter(a %in% letters)
expect_equal(nrow(res), 2L)
- res <- df %>% group_by(g) %>% filter( a %in% letters )
+ res <- df %>% group_by(g) %>% filter(a %in% letters)
expect_equal(nrow(res), 2L)
})
test_that("row_number does not segfault with example from #781", {
- z <- data.frame(a=c(1,2,3))
+ z <- data.frame(a = c(1, 2, 3))
b <- "a"
res <- z %>% filter(row_number(b) == 2)
- expect_equal( nrow(res), 0L )
+ expect_equal(nrow(res), 0L)
})
test_that("filter does not alter expression (#971)", {
- my_filter <- ~ am == 1;
- expect_error( mtcars %>% filter(my_filter) )
- expect_equal( my_filter[[2]][[2]], as.name("am") )
+ my_filter <- ~ am == 1
+ expect_equal(my_filter[[2]][[2]], as.name("am"))
})
test_that("hybrid evaluation handles $ correctly (#1134)", {
- df <- data_frame( x = 1:10, g = rep(1:5, 2 ) )
- res <- df %>% group_by(g) %>% filter( x > min(df$x) )
- expect_equal( nrow(res), 9L )
+ df <- data_frame(x = 1:10, g = rep(1:5, 2))
+ res <- df %>% group_by(g) %>% filter(x > min(df$x))
+ expect_equal(nrow(res), 9L)
})
test_that("filter correctly handles empty data frames (#782)", {
res <- data_frame() %>% filter(F)
- expect_equal( nrow(res), 0L )
- expect_equal( length(names(res)), 0L )
+ expect_equal(nrow(res), 0L)
+ expect_equal(length(names(res)), 0L)
})
test_that("filter(.,TRUE,TRUE) works (#1210)", {
- df <- data.frame(x=1:5)
- res <- filter(df,TRUE,TRUE)
+ df <- data.frame(x = 1:5)
+ res <- filter(df, TRUE, TRUE)
expect_equal(res, df)
})
test_that("filter, slice and arrange preserves attributes (#1064)", {
df <- structure(
- data.frame( x = 1:10, g1 = rep(1:2, each = 5), g2 = rep(1:5, 2) ),
+ data.frame(x = 1:10, g1 = rep(1:2, each = 5), g2 = rep(1:5, 2)),
meta = "this is important"
)
- res <- filter( df, x < 5 ) %>% attr("meta" )
- expect_equal( res, "this is important")
+ res <- filter(df, x < 5) %>% attr("meta")
+ expect_equal(res, "this is important")
- res <- filter( df, x < 5, x > 4) %>% attr("meta" )
- expect_equal( res, "this is important")
+ res <- filter(df, x < 5, x > 4) %>% attr("meta")
+ expect_equal(res, "this is important")
res <- df %>% slice(1:50) %>% attr("meta")
- expect_equal( res, "this is important")
+ expect_equal(res, "this is important")
res <- df %>% arrange(x) %>% attr("meta")
- expect_equal( res, "this is important")
+ expect_equal(res, "this is important")
- res <- df %>% summarise( n() ) %>% attr("meta")
- expect_equal( res, "this is important")
+ res <- df %>% summarise(n()) %>% attr("meta")
+ expect_equal(res, "this is important")
- res <- df %>% group_by(g1) %>% summarise( n() ) %>% attr("meta")
- expect_equal( res, "this is important")
+ res <- df %>% group_by(g1) %>% summarise(n()) %>% attr("meta")
+ expect_equal(res, "this is important")
- res <- df %>% group_by(g1,g2) %>% summarise( n() ) %>% attr("meta")
- expect_equal( res, "this is important")
+ res <- df %>% group_by(g1, g2) %>% summarise(n()) %>% attr("meta")
+ expect_equal(res, "this is important")
})
test_that("filter works with rowwise data (#1099)", {
df <- data_frame(First = c("string1", "string2"), Second = c("Sentence with string1", "something"))
res <- df %>% rowwise() %>% filter(grepl(First, Second, fixed = TRUE))
- expect_equal( nrow(res), 1L)
- expect_equal( df[1,], res)
+ expect_equal(nrow(res), 1L)
+ expect_equal(df[1, ], res)
})
test_that("grouped filter handles indices (#880)", {
- res <- iris %>% group_by(Species) %>% filter( Sepal.Length > 5 )
- res2 <- mutate( res, Petal = Petal.Width * Petal.Length)
- expect_equal( nrow(res), nrow(res2) )
- expect_equal( attr(res, "indices"), attr(res2, "indices") )
+ res <- iris %>% group_by(Species) %>% filter(Sepal.Length > 5)
+ res2 <- mutate(res, Petal = Petal.Width * Petal.Length)
+ expect_equal(nrow(res), nrow(res2))
+ expect_equal(attr(res, "indices"), attr(res2, "indices"))
})
test_that("filter(FALSE) drops indices", {
@@ -291,101 +290,55 @@ test_that("filter(FALSE) drops indices", {
group_by(cyl) %>%
filter(FALSE) %>%
attr("indices")
- expect_equal(out, NULL)
+ expect_identical(out, list())
})
test_that("filter handles S4 objects (#1366)", {
env <- environment()
- Numbers <- suppressWarnings( setClass("Numbers", slots = c(foo = "numeric"), contains = "integer", where = env) )
+ Numbers <- suppressWarnings(setClass(
+ "Numbers", slots = c(foo = "numeric"), contains = "integer", where = env
+ ))
on.exit(removeClass("Numbers", where = env))
- df <- data.frame( x = Numbers( 1:10, foo = 10 ) )
- res <- filter( df, x > 3 )
- expect_true( isS4(res$x) )
- expect_is( res$x, "Numbers")
- expect_equal( res$x@foo, 10)
+ df <- data.frame(x = Numbers(1:10, foo = 10))
+ res <- filter(df, x > 3)
+ expect_true(isS4(res$x))
+ expect_is(res$x, "Numbers")
+ expect_equal(res$x@foo, 10)
})
test_that("hybrid lag and default value for string columns work (#1403)", {
res <- mtcars %>%
- mutate(xx=LETTERS[gear]) %>%
- filter(xx==lag(xx, default='foo'))
- xx <- LETTERS[ mtcars$gear ]
- ok <- xx == lag( xx, default = "foo" )
- expect_equal( xx[ok], res$xx )
+ mutate(xx = LETTERS[gear]) %>%
+ filter(xx == lag(xx, default = "foo"))
+ xx <- LETTERS[mtcars$gear]
+ ok <- xx == lag(xx, default = "foo")
+ expect_equal(xx[ok], res$xx)
res <- mtcars %>%
- mutate(xx=LETTERS[gear]) %>%
- filter(xx==lead(xx, default='foo'))
- xx <- LETTERS[mtcars$gear ]
- ok <- xx == lead( xx, default = "foo" )
- expect_equal( xx[ok], res$xx )
-})
-
-test_that("filter recognizes global #1469", {
- vs <- 4
- res <- mtcars %>% filter(cyl == global(vs))
- expect_true( all(res$cyl == 4) )
-
- res <- mtcars %>% group_by(cyl) %>% filter(cyl == global(vs))
- expect_true( all(res$cyl == 4) )
-})
-
-test_that("filter understands column. #1012", {
- ir1 <- filter( iris, Sepal.Length < 5)
- ir2 <- filter( iris, column("Sepal.Length") < 5)
- ir3 <- filter( iris, column(paste0("Sepal.", "Length")) < 5)
- ir4 <- filter( iris, column(~Sepal.Length) < 5 )
- ir5 <- filter( iris, column(quote(Sepal.Length)) < 5 )
- symb <- quote(Sepal.Length)
- ir6 <- filter( iris, column(symb) < 5 )
- expect_equal(ir1, ir2)
- expect_equal(ir1, ir3)
- expect_equal(ir1, ir4)
- expect_equal(ir1, ir5)
- expect_equal(ir1, ir6)
- expect_error( filter(iris, column(~Sepal.Length + Species) < 5), "unhandled formula in column")
- expect_error( filter(iris, column(~foo) < 5), "expands to a symbol that is not a variable from the data")
- expect_error( filter(iris, column(letters) < 5), "column must return a single string" )
-
- ir1 <- filter( group_by(iris, Species), Sepal.Length < 5)
- ir2 <- filter( group_by(iris, Species), column("Sepal.Length") < 5)
- ir3 <- filter( group_by(iris, Species), column(paste0("Sepal.", "Length")) < 5)
- ir4 <- filter( group_by(iris, Species), column(~Sepal.Length) < 5)
- ir5 <- filter( group_by(iris, Species), column(quote(Sepal.Length)) < 5)
- ir6 <- filter( group_by(iris, Species), column(symb) < 5)
- expect_equal(ir1, ir2)
- expect_equal(ir1, ir3)
- expect_equal(ir1, ir4)
- expect_equal(ir1, ir5)
- expect_equal(ir1, ir6)
-
- expect_error( iris %>% group_by(Species) %>% filter(column(~Sepal.Length + Species) < 5 ), "unhandled formula in column")
- expect_error( iris %>% group_by(Species) %>% filter(column(~foo) < 5), "expands to a symbol that is not a variable from the data")
- expect_error( iris %>% group_by(Species) %>% filter(column(letters) < 5), "column must return a single string" )
-
+ mutate(xx = LETTERS[gear]) %>%
+ filter(xx == lead(xx, default = "foo"))
+ xx <- LETTERS[mtcars$gear]
+ ok <- xx == lead(xx, default = "foo")
+ expect_equal(xx[ok], res$xx)
})
+# .data and .env tests now in test-hybrid-traverse.R
-test_that("each argument gets implicit parens", {
- df <- data_frame(
- v1 = c("a", "b", "a", "b"),
- v2 = c("b", "a", "a", "b"),
- v3 = c("a", "b", "c", "d")
+test_that("filter fails gracefully on raw columns (#1803)", {
+ df <- data_frame(a = 1:3, b = as.raw(1:3))
+ expect_error(
+ filter(df, a == 1),
+ "Column `b` is of unsupported type raw",
+ fixed = TRUE
+ )
+ expect_error(
+ filter(df, b == 1),
+ "Column `b` is of unsupported type raw",
+ fixed = TRUE
)
-
- tbls <- test_load(df)
-
- one <- tbls %>% lapply(. %>% filter((v1 == "a" | v2 == "a") & v3 == "a"))
- two <- tbls %>% lapply(. %>% filter(v1 == "a" | v2 == "a", v3 == "a"))
-
- lapply(seq_along(one), function(i) {
- expect_equal(collect(one[[i]]), collect(two[[i]]))
- })
})
-test_that("filter fails gracefully on raw columns (#1803)", {
- df <- data_frame(a = 1:3, b = as.raw(1:3))
- expect_error( filter(df, a == 1), "unsupported type" )
- expect_error( filter(df, b == 1), "unsupported type" )
+test_that("`vars` attribute is not added if empty (#2772)", {
+ expect_identical(tibble(x = 1:2) %>% filter(x == 1), tibble(x = 1L))
})
diff --git a/tests/testthat/test-funs-predicates.R b/tests/testthat/test-funs-predicates.R
new file mode 100644
index 0000000..b3b2b3a
--- /dev/null
+++ b/tests/testthat/test-funs-predicates.R
@@ -0,0 +1,23 @@
+context("funs-predicates")
+
+test_that("all_exprs() creates intersection", {
+ expect_identical(all_exprs(am == 1), quo(am == 1))
+
+ quo <- set_env(quo((!! quo(cyl == 2)) & (!! quo(am == 1))), base_env())
+ expect_identical(all_exprs(cyl == 2, am == 1), quo)
+})
+
+test_that("any_exprs() creates union", {
+ expect_identical(any_exprs(am == 1), quo(am == 1))
+
+ quo <- set_env(quo((!! quo(cyl == 2)) | (!! quo(am == 1))), base_env())
+ expect_identical(any_exprs(cyl == 2, am == 1), quo)
+})
+
+test_that("all_exprs() without expression returns an error", {
+ expect_error(
+ all_exprs(),
+ "At least one expression must be given",
+ fixed = TRUE
+ )
+})
diff --git a/tests/testthat/test-funs.R b/tests/testthat/test-funs.R
new file mode 100644
index 0000000..d946b50
--- /dev/null
+++ b/tests/testthat/test-funs.R
@@ -0,0 +1,61 @@
+context("funs")
+
+test_that("fun_list is merged with new args", {
+ funs <- funs(fn = bar)
+ funs <- as_fun_list(funs, quo(bar), env(), baz = "baz")
+ expect_identical(funs$fn, quo(bar(., baz = "baz")))
+})
+
+test_that("funs() works with namespaced calls", {
+ expect_identical(summarise_all(mtcars, funs(base::mean(.))), summarise_all(mtcars, funs(mean(.))))
+ expect_identical(summarise_all(mtcars, funs(base::mean)), summarise_all(mtcars, funs(mean(.))))
+})
+
+test_that("funs() accepts quoted functions", {
+ expect_identical(funs(mean), funs("mean"))
+})
+
+test_that("funs() accepts unquoted functions", {
+ funs <- funs(fn = !! mean)
+ expect_identical(funs$fn, new_quosure(lang(base::mean, quote(.))))
+})
+
+test_that("funs() accepts quoted calls", {
+ expect_identical(funs(mean), funs(mean(.)))
+})
+
+test_that("funs() can be merged with new arguments", {
+ fns <- funs(foo(.))
+ expect_identical(as_fun_list(fns, ~NULL, get_env(), foo = 1L), funs(foo(., foo = 1L)))
+})
+
+
+enfun <- function(.funs, ...) {
+ as_fun_list(.funs, enquo(.funs), caller_env(), ...)
+}
+
+test_that("can enfun() literal functions", {
+ expect_identical(enfun(identity(mean)), funs(!! mean))
+})
+
+test_that("can enfun() named functions by expression", {
+ expect_identical(enfun(mean), funs(mean(.)))
+})
+
+test_that("local objects are not treated as symbols", {
+ mean <- funs(my_mean(.))
+ expect_identical(enfun(mean), mean)
+})
+
+test_that("can enfun() character vectors", {
+ expect_identical(enfun(c("min", "max")), funs(min, max))
+})
+
+test_that("can enfun() quosures", {
+ expect_identical(enfun(quo(mean(.))), funs(mean(.)))
+})
+
+test_that("can enfun() purrr-style lambdas", {
+ my_mean <- as_function(~mean(.x))
+ expect_identical(enfun(~mean(.x)), funs(!! my_mean))
+})
diff --git a/tests/testthat/test-group-by.r b/tests/testthat/test-group-by.r
index f3e8fd0..d2cab9d 100644
--- a/tests/testthat/test-group-by.r
+++ b/tests/testthat/test-group-by.r
@@ -2,44 +2,26 @@ context("Group by")
df <- data.frame(x = rep(1:3, each = 10), y = rep(1:6, each = 5))
-tbls <- test_load(df)
-
test_that("group_by with add = TRUE adds groups", {
- add_groups1 <- function(tbl) groups(group_by(tbl, x, y, add = TRUE))
- add_groups2 <- function(tbl) groups(group_by(group_by(tbl, x, add = TRUE), y,
- add = TRUE))
-
- expect_equal(add_groups1(tbls$df), list(quote(x), quote(y)))
- expect_equal(add_groups2(tbls$df), list(quote(x), quote(y)))
+ add_groups1 <- function(tbl) group_by(tbl, x, y, add = TRUE)
+ add_groups2 <- function(tbl) group_by(group_by(tbl, x, add = TRUE), y, add = TRUE)
- skip_if_no_sqlite()
- expect_equal(add_groups1(tbls$sqlite), list(quote(x), quote(y)))
- expect_equal(add_groups2(tbls$sqlite), list(quote(x), quote(y)))
-})
-
-test_that("collect, collapse and compute preserve grouping", {
- skip_if_no_sqlite()
- g <- memdb_frame(x = 1:3, y = 1:3) %>% group_by(x, y)
-
- expect_equal(groups(compute(g)), groups(g))
- expect_equal(groups(collapse(g)), groups(g))
- expect_equal(groups(collect(g)), groups(g))
+ expect_groups(add_groups1(df), c("x", "y"))
+ expect_groups(add_groups2(df), c("x", "y"))
})
test_that("joins preserve grouping", {
- for (tbl in tbls) {
- g <- group_by(tbl, x)
+ g <- group_by(df, x)
- expect_equal(groups(inner_join(g, g, by = c("x", "y"))), groups(g))
- expect_equal(groups(left_join(g, g, by = c("x", "y"))), groups(g))
- expect_equal(groups(semi_join(g, g, by = c("x", "y"))), groups(g))
- expect_equal(groups(anti_join(g, g, by = c("x", "y"))), groups(g))
- }
+ expect_groups(inner_join(g, g, by = c("x", "y")), "x")
+ expect_groups(left_join (g, g, by = c("x", "y")), "x")
+ expect_groups(semi_join (g, g, by = c("x", "y")), "x")
+ expect_groups(anti_join (g, g, by = c("x", "y")), "x")
})
test_that("constructors drops groups", {
df <- data.frame(x = 1:3) %>% group_by(x)
- expect_equal(groups(tbl_df(df)), NULL)
+ expect_no_groups(tbl_df(df))
})
test_that("grouping by constant adds column (#410)", {
@@ -66,23 +48,23 @@ test_that("local group_by preserves variable types", {
expected <- data_frame(unique(df_var[[var]]), n = 1L)
names(expected)[1] <- var
- summarised <- df_var %>% group_by_(var) %>% summarise(n = n())
+ summarised <- df_var %>% group_by(!! sym(var)) %>% summarise(n = n())
expect_equal(summarised, expected, info = var)
}
})
-test_that("mutate does not loose variables (#144)",{
+test_that("mutate does not loose variables (#144)", {
df <- tbl_df(data.frame(a = rep(1:4, 2), b = rep(1:4, each = 2), x = runif(8)))
by_ab <- group_by(df, a, b)
- by_a <- summarise( by_ab, x = sum(x))
+ by_a <- summarise(by_ab, x = sum(x))
by_a_quartile <- group_by(by_a, quartile = ntile(x, 4))
- expect_equal(names(by_a_quartile), c("a", "b", "x", "quartile" ))
+ expect_equal(names(by_a_quartile), c("a", "b", "x", "quartile"))
})
test_that("group_by uses shallow copy", {
m1 <- group_by(mtcars, cyl)
- expect_true(is.null(groups(mtcars)))
+ expect_no_groups(mtcars)
expect_equal(dfloc(mtcars), dfloc(m1))
})
@@ -93,27 +75,34 @@ test_that("FactorVisitor handles NA. #183", {
})
test_that("group_by orders by groups. #242", {
- df <- data.frame(a = sample(1:10, 100, replace = TRUE)) %>% group_by(a)
- expect_equal( attr(df, "labels")$a, 1:10 )
+ df <- data.frame(a = sample(1:10, 3000, replace = TRUE)) %>% group_by(a)
+ expect_equal(attr(df, "labels")$a, 1:10)
- df <- data.frame(a = sample(letters[1:10], 100, replace = TRUE), stringsAsFactors = FALSE) %>% group_by(a)
- expect_equal(attr(df, "labels")$a, letters[1:10] )
+ df <- data.frame(a = sample(letters[1:10], 3000, replace = TRUE), stringsAsFactors = FALSE) %>% group_by(a)
+ expect_equal(attr(df, "labels")$a, letters[1:10])
- df <- data.frame(a = sample(sqrt(1:10), 100, replace = TRUE)) %>% group_by(a)
+ df <- data.frame(a = sample(sqrt(1:10), 3000, replace = TRUE)) %>% group_by(a)
expect_equal(attr(df, "labels")$a, sqrt(1:10))
-
})
test_that("group_by uses the white list", {
- df <- data.frame( times = 1:5 )
- df$times <- as.POSIXlt( seq.Date( Sys.Date(), length.out = 5, by = "day" ) )
- expect_error(group_by(df, times))
+ df <- data.frame(times = 1:5)
+ df$times <- as.POSIXlt(seq.Date(Sys.Date(), length.out = 5, by = "day"))
+ expect_error(
+ group_by(df, times),
+ "Column `times` is of unsupported class POSIXlt/POSIXt",
+ fixed = TRUE
+ )
})
-test_that("group_by fails when lists are used as grouping variables (#276)",{
+test_that("group_by fails when lists are used as grouping variables (#276)", {
df <- data.frame(x = 1:3)
df$y <- list(1:2, 1:3, 1:4)
- expect_error(group_by(df,y))
+ expect_error(
+ group_by(df, y),
+ "Column `y` can't be used as a grouping variable because it's a list",
+ fixed = TRUE
+ )
})
@@ -122,19 +111,31 @@ test_that("select(group_by(.)) implicitely adds grouping variables (#170)", {
expect_equal(names(res), c("vs", "mpg"))
})
-test_that("grouped_df errors on empty vars (#398)",{
+test_that("grouped_df errors on empty vars (#398)", {
m <- mtcars %>% group_by(cyl)
attr(m, "vars") <- NULL
attr(m, "indices") <- NULL
- expect_error( m %>% do(mpg = mean(.$mpg)) )
+ expect_error(
+ m %>% do(mpg = mean(.$mpg)),
+ "no variables to group by",
+ fixed = TRUE
+ )
+})
+
+test_that("grouped_df errors on non-existent var (#2330)", {
+ df <- data.frame(x = 1:5)
+ expect_error(
+ grouped_df(df, list(quote(y))),
+ "Column `y` is unknown"
+ )
})
test_that("group_by only creates one group for NA (#401)", {
- x <- as.numeric(c(NA,NA,NA,10:1,10:1))
- w <- c(20,30,40,1:10,1:10)*10
+ x <- as.numeric(c(NA, NA, NA, 10:1, 10:1))
+ w <- c(20, 30, 40, 1:10, 1:10) * 10
n_distinct(x) # 11 OK
- res <- data.frame(x=x,w=w) %>% group_by(x) %>% summarise(n=n())
+ res <- data.frame(x = x, w = w) %>% group_by(x) %>% summarise(n = n())
expect_equal(nrow(res), 11L)
})
@@ -148,41 +149,46 @@ test_that("there can be 0 groups (#486)", {
test_that("group_by works with zero-row data frames (#486)", {
dfg <- group_by(data.frame(a = numeric(0), b = numeric(0), g = character(0)), g)
expect_equal(dim(dfg), c(0, 3))
- expect_equal(groups(dfg), list(quote(g)))
+ expect_groups(dfg, "g")
expect_equal(group_size(dfg), integer(0))
x <- summarise(dfg, n = n())
expect_equal(dim(x), c(0, 2))
- expect_equal(groups(x), NULL)
+ expect_no_groups(x)
x <- mutate(dfg, c = b + 1)
expect_equal(dim(x), c(0, 4))
- expect_equal(groups(x), list(quote(g)))
+ expect_groups(x, "g")
expect_equal(group_size(x), integer(0))
x <- filter(dfg, a == 100)
expect_equal(dim(x), c(0, 3))
- expect_equal(groups(x), list(quote(g)))
+ expect_groups(x, "g")
expect_equal(group_size(x), integer(0))
x <- arrange(dfg, a, g)
expect_equal(dim(x), c(0, 3))
- expect_equal(groups(x), list(quote(g)))
+ expect_groups(x, "g")
expect_equal(group_size(x), integer(0))
x <- select(dfg, a) # Only select 'a' column; should result in 'g' and 'a'
expect_equal(dim(x), c(0, 2))
- expect_equal(groups(x), list(quote(g)))
+ expect_groups(x, "g")
expect_equal(group_size(x), integer(0))
})
test_that("grouped_df requires a list of symbols (#665)", {
features <- list("feat1", "feat2", "feat3")
- expect_error( grouped_df(data.frame(feat1=1, feat2=2, feat3=3), features) )
+ # error message by assertthat
+ expect_error(grouped_df(data.frame(feat1 = 1, feat2 = 2, feat3 = 3), features))
})
-test_that("group_by gives meaningful message with unknow column (#716)",{
- expect_error( group_by(iris, wrong_name_of_variable), "unknown variable to group by" )
+test_that("group_by gives meaningful message with unknow column (#716)", {
+ expect_error(
+ group_by(iris, wrong_name_of_variable),
+ "Column `wrong_name_of_variable` is unknown",
+ fixed = TRUE
+ )
})
test_that("[ on grouped_df preserves grouping if subset includes grouping vars", {
@@ -200,59 +206,92 @@ test_that("[ on grouped_df drops grouping if subset doesn't include grouping var
by_cyl <- mtcars %>% group_by(cyl)
no_cyl <- by_cyl %>% `[`(c(1, 3))
- expect_equal(groups(no_cyl), NULL)
+ expect_no_groups(no_cyl)
expect_is(no_cyl, "tbl_df")
})
-test_that("group_by works after arrange (#959)",{
- df <- data_frame(Log= c(1,2,1,2,1,2), Time = c(10,1,3,0,15,11))
+test_that("group_by works after arrange (#959)", {
+ df <- data_frame(Log = c(1, 2, 1, 2, 1, 2), Time = c(10, 1, 3, 0, 15, 11))
res <- df %>%
- arrange(Time) %>%
- group_by(Log) %>%
- mutate(Diff = Time - lag(Time))
- expect_true( all(is.na( res$Diff[ c(1,3) ] )))
- expect_equal( res$Diff[ c(2,4,5,6) ], c(1,7,10,5) )
+ arrange(Time) %>%
+ group_by(Log) %>%
+ mutate(Diff = Time - lag(Time))
+ expect_true(all(is.na(res$Diff[c(1, 3)])))
+ expect_equal(res$Diff[c(2, 4, 5, 6)], c(1, 7, 10, 5))
})
test_that("group_by keeps attributes", {
- d <- data.frame( x = structure( 1:10, foo = "bar" ) )
+ d <- data.frame(x = structure(1:10, foo = "bar"))
gd <- group_by(d)
- expect_equal( attr(gd$x, "foo"), "bar")
+ expect_equal(attr(gd$x, "foo"), "bar")
})
test_that("ungroup.rowwise_df gives a tbl_df (#936)", {
res <- tbl_df(mtcars) %>% rowwise %>% ungroup %>% class
- expect_equal( res, c("tbl_df", "data.frame"))
+ expect_equal(res, c("tbl_df", "tbl", "data.frame"))
})
-test_that( "group_by supports column (#1012)", {
- g1 <- mtcars %>% group_by(cyl)
- g2 <- mtcars %>% group_by(column(~cyl))
- g3 <- mtcars %>% group_by(column("cyl"))
- a <- "cyl"
- g4 <- mtcars %>% group_by(column(a))
-
- expect_equal( attr(g1, "vars"), attr(g2, "vars"))
- expect_equal( attr(g1, "vars"), attr(g3, "vars"))
- expect_equal( attr(g1, "vars"), attr(g4, "vars"))
-})
-
-test_that("group_by handles encodings (#1507)", {
- skip_on_os("windows") # 1950
-
- df <- data.frame(x=1:3, Eng=2:4)
- names(df) <- enc2utf8(c("\u00e9", "Eng"))
- res <- group_by_(df, iconv("\u00e9", from = "UTF-8", to = "latin1") )
- expect_equal( names(res), names(df) )
+test_that(paste0("group_by handles encodings for native strings (#1507)"), {
+ with_non_utf8_encoding({
+ special <- get_native_lang_string()
+
+ df <- data.frame(x = 1:3, Eng = 2:4)
+
+ for (names_converter in c(enc2native, enc2utf8)) {
+ for (dots_converter in c(enc2native, enc2utf8)) {
+ names(df) <- names_converter(c(special, "Eng"))
+ res <- group_by(df, !!! syms(dots_converter(special)))
+ expect_equal(names(res), names(df))
+ expect_groups(res, special)
+ }
+ }
+
+ for (names_converter in c(enc2native, enc2utf8)) {
+ names(df) <- names_converter(c(special, "Eng"))
+
+ res <- group_by(df, !!! special)
+ expect_equal(names(res), c(names(df), deparse(special)))
+ expect_equal(groups(res), list(as.name(enc2native(deparse(special)))))
+ }
+ })
})
test_that("group_by fails gracefully on raw columns (#1803)", {
df <- data_frame(a = 1:3, b = as.raw(1:3))
- expect_error( group_by(df, a), "unsupported type" )
- expect_error( group_by(df, b), "unsupported type" )
+ expect_error(
+ group_by(df, a),
+ "Column `b` is of unsupported type raw",
+ fixed = TRUE
+ )
+ expect_error(
+ group_by(df, b),
+ "Column `b` is of unsupported type raw",
+ fixed = TRUE
+ )
})
test_that("rowwise fails gracefully on raw columns (#1803)", {
df <- data_frame(a = 1:3, b = as.raw(1:3))
- expect_error( rowwise(df), "unsupported type" )
+ expect_error(
+ rowwise(df),
+ "Column `b` is of unsupported type raw",
+ fixed = TRUE
+ )
+})
+
+test_that("group_by() names pronouns correctly (#2686)", {
+ expect_named(group_by(tibble(x = 1), .data$x), "x")
+ expect_named(group_by(tibble(x = 1), .data[["x"]]), "x")
+})
+
+test_that("group_by() does not affect input data (#3028)", {
+ x <-
+ data.frame(old1 = c(1, 2, 3), old2 = c(4, 5, 6)) %>%
+ group_by(old1)
+
+ y <-
+ x %>%
+ select(new1 = old1, new2 = old2)
+
+ expect_identical(groups(x), syms(quote(old1)))
})
diff --git a/tests/testthat/test-group-indices.R b/tests/testthat/test-group-indices.R
index 1781221..878869f 100644
--- a/tests/testthat/test-group-indices.R
+++ b/tests/testthat/test-group-indices.R
@@ -1,19 +1,33 @@
context("Group indices")
test_that("group_indices from ungrouped or grouped gives same result", {
- res1 <- group_indices( mtcars, cyl, vs, am )
+ res1 <- group_indices(mtcars, cyl, vs, am)
res2 <- mtcars %>% group_by(cyl, vs, am) %>% group_indices()
- expect_equal(res1, res2)
+ expect_equal(res1, res2)
})
test_that("group_indices handles the case where no variable is given (#867)", {
res <- group_indices(mtcars)
- expect_true( all(res==1L) )
+ expect_true(all(res == 1L))
})
test_that("group_indices handles grouped data and no arguments", {
res1 <- mtcars %>% group_by(cyl) %>% group_indices()
res2 <- mtcars %>% group_indices(cyl)
- expect_equal(res1, res2)
+ expect_equal(res1, res2)
})
+test_that("group_indices can be used in mutate (#2160)", {
+ res1 <- mtcars %>% mutate(., group_idx = group_indices(., cyl))
+ res2 <- mtcars %>% mutate(group_idx = as.integer(factor(cyl)))
+ expect_equal(res1, res2)
+})
+
+test_that("group indices are updated correctly for joined grouped data frames (#2330)", {
+ d1 <- data.frame(x = 1:2, y = 1:2) %>% group_by(x, y)
+ expect_equal(group_indices(d1), d1$x)
+
+ d2 <- expand.grid(x = 1:2, y = 1:2)
+ res <- inner_join(d1, d2, by = "x")
+ expect_equal(group_indices(res), res$x)
+})
diff --git a/tests/testthat/test-group-size.R b/tests/testthat/test-group-size.R
index 9a6e7c6..0ed5b77 100644
--- a/tests/testthat/test-group-size.R
+++ b/tests/testthat/test-group-size.R
@@ -1,25 +1,37 @@
context("Group sizes")
-df <- data.frame(x = rep(1:3, each = 10), y = rep(1:6, each = 5))
-tbls <- test_load(df)
-
test_that("ungrouped data has 1 group, with group size = nrow()", {
- for (tbl in tbls) {
- expect_equal(n_groups(tbl), 1L)
- expect_equal(group_size(tbl), 30)
- }
+ df <- tibble(x = rep(1:3, each = 10), y = rep(1:6, each = 5))
+
+ expect_equal(n_groups(df), 1L)
+ expect_equal(group_size(df), 30)
})
test_that("rowwise data has one group for each group", {
- rw <- rowwise(df)
- expect_equal(n_groups(rw), 30)
- expect_equal(group_size(rw), rep(1, 30))
+ rw <- rowwise(mtcars)
+ expect_equal(n_groups(rw), 32)
+ expect_equal(group_size(rw), rep(1, 32))
})
test_that("group_size correct for grouped data", {
- for (tbl in tbls) {
- grp <- group_by(tbl, x)
- expect_equal(n_groups(grp), 3L)
- expect_equal(group_size(grp), rep(10, 3))
- }
+ df <- tibble(x = rep(1:3, each = 10), y = rep(1:6, each = 5)) %>% group_by(x)
+ expect_equal(n_groups(df), 3L)
+ expect_equal(group_size(df), rep(10, 3))
+})
+
+
+# For following tests, add an extra level that's not present in data
+test_that("n_groups drops zero-length groups", {
+ df <- tibble(x = factor(1:3, levels = 1:4)) %>% group_by(x)
+ expect_equal(n_groups(df), 3)
+})
+
+test_that("summarise drops zero-length groups", {
+ df <- tibble(x = factor(rep(1:3, each = 10)))
+
+ out <- df %>%
+ group_by(x) %>%
+ summarise(n = n())
+
+ expect_equal(out$n, c(10L, 10L, 10L))
})
diff --git a/tests/testthat/test-hybrid-traverse.R b/tests/testthat/test-hybrid-traverse.R
new file mode 100644
index 0000000..a4d2684
--- /dev/null
+++ b/tests/testthat/test-hybrid-traverse.R
@@ -0,0 +1,464 @@
+context("hybrid-traverse")
+
+test_df <- data_frame(
+ id = c(1L, 2L, 2L),
+ a = 1:3,
+ b = as.numeric(1:3),
+ c = letters[1:3],
+ d = c(TRUE, FALSE, NA),
+ e = list(list(a = 1, x = 2), list(a = 2, x = 3), list(a = 3, x = 4))
+)
+
+test_that("$ is parsed correctly (#1400)", {
+ grouping <- rowwise
+
+ expect_equal(
+ test_df %>%
+ grouping %>%
+ mutate(f = e$x) %>%
+ select(-e),
+ test_df %>%
+ mutate(f = as.numeric(2:4)) %>%
+ grouping %>%
+ select(-e)
+ )
+})
+
+test_that("$ is parsed correctly if column by the same name exists (#1400)", {
+ grouping <- rowwise
+
+ expect_equal(
+ test_df %>%
+ grouping %>%
+ mutate(f = e$a) %>%
+ select(-e),
+ test_df %>%
+ mutate(f = as.numeric(1:3)) %>%
+ grouping %>%
+ select(-e)
+ )
+})
+
+test_that("[[ works for ungrouped access (#912)", {
+ grouping <- identity
+
+ expect_equal(
+ test_df %>%
+ grouping %>%
+ mutate(f = mean(test_df[["a"]])) %>%
+ select(-e),
+ test_df %>%
+ mutate(f = mean(a)) %>%
+ grouping %>%
+ select(-e)
+ )
+})
+
+test_that("[[ works for rowwise access of list columns (#912)", {
+ grouping <- rowwise
+
+ df <- tibble(
+ x = c("a", "b"),
+ y = list(list(a = 1, b = 2), list(a = 3, b = 4))
+ )
+
+ expect_equal(
+ df %>% rowwise() %>% transmute(z = y[[x]]),
+ data_frame(z = c(1, 4))
+ )
+})
+
+test_that("$ works for rle result (#2125)", {
+ grouping <- identity
+
+ expect_equal(
+ test_df %>%
+ grouping %>%
+ mutate(f = rle(b)$lengths) %>%
+ select(-e),
+ test_df %>%
+ mutate(f = rep(1L, 3L)) %>%
+ grouping %>%
+ select(-e)
+ )
+})
+
+test_hybrid <- function(grouping) {
+ test_that("case_when() works for LHS (#1719, #2244)", {
+ expect_equal(
+ test_df %>%
+ grouping %>%
+ mutate(f = case_when(a == 1 ~ 1, a == 2 ~ 2, TRUE ~ 3)) %>%
+ select(-e),
+ test_df %>%
+ mutate(f = b) %>%
+ grouping %>%
+ select(-e)
+ )
+ })
+
+ test_that("case_when() works for RHS (#1719, #2244)", {
+ expect_equal(
+ test_df %>%
+ grouping %>%
+ mutate(f = case_when(a == 1 ~ as.numeric(a), a == 2 ~ b, TRUE ~ 3)) %>%
+ select(-e),
+ test_df %>%
+ mutate(f = b) %>%
+ grouping %>%
+ select(-e)
+ )
+ })
+
+ test_that("assignments work (#1452)", {
+ expect_false(env_has(nms = "xx"))
+ expect_equal(
+ test_df %>%
+ grouping %>%
+ mutate(f = {
+ xx <- 5
+ xx
+ }) %>%
+ select(-e),
+ test_df %>%
+ mutate(f = 5) %>%
+ grouping %>%
+ select(-e)
+ )
+ expect_false(env_has(nms = "xx"))
+ })
+
+ test_that("assignments don't change variable (#315, #1452)", {
+ expect_false(env_has(nms = "a"))
+ expect_equal(
+ test_df %>%
+ grouping %>%
+ mutate(f = {
+ a <- 5
+ a
+ }) %>%
+ select(-e),
+ test_df %>%
+ mutate(f = 5) %>%
+ grouping %>%
+ select(-e)
+ )
+ expect_false(env_has(nms = "a"))
+ })
+
+ test_that("assignments don't carry over (#1452)", {
+ # error messages by bindr/rlang
+ expect_error(
+ test_df %>%
+ grouping %>%
+ mutate(f = { xx <- 5; xx }, g = xx),
+ "xx"
+ )
+ })
+
+ test_that("assignments don't leak (#1452)", {
+ expect_false(env_has(nms = "a"))
+ test <-
+ test_df %>%
+ grouping %>%
+ mutate(f = {
+ xx <- 5
+ xx
+ })
+ expect_false(env_has(nms = "a"))
+ })
+
+ test_that("[ works (#912)", {
+ grouped_df <- test_df %>%
+ grouping
+
+ expect_equal(
+ grouped_df %>%
+ mutate(f = mean(grouped_df["a"][[1]])) %>%
+ select(-e),
+ test_df %>%
+ mutate(f = mean(a)) %>%
+ grouping %>%
+ select(-e)
+ )
+ })
+
+ test_that("interpolation works (#1012)", {
+ var <- quo(b)
+
+ expect_equal(
+ test_df %>%
+ grouping %>%
+ mutate(., f = mean(UQ(var))) %>%
+ select(-e),
+ test_df %>%
+ grouping %>%
+ mutate(f = mean(b)) %>%
+ select(-e)
+ )
+ })
+
+ test_that("can compute 1 - ecdf(y)(y) (#2018)", {
+ surv <- function(x) 1 - ecdf(x)(x)
+
+ expect_equal(
+ test_df %>%
+ grouping %>%
+ mutate(., f = 1 - ecdf(b)(b)) %>%
+ select(-e),
+ test_df %>%
+ grouping %>%
+ mutate(., f = surv(b)) %>%
+ select(-e)
+ )
+ })
+
+ test_that("filter understands .data (#1012)", {
+ expect_equal(
+ test_df %>%
+ grouping %>%
+ filter({
+ b <- 5
+ .data$b < 2
+ }) %>%
+ select(-e),
+ test_df %>%
+ grouping %>%
+ filter(b < 2) %>%
+ select(-e)
+ )
+ })
+
+ test_that("filter understands .data (#1012)", {
+ expect_equal(
+ test_df %>%
+ grouping %>%
+ filter(.data[["b"]] < 2) %>%
+ select(-e),
+ test_df %>%
+ grouping %>%
+ filter(b < 2) %>%
+ select(-e)
+ )
+ })
+
+ test_that("filter understands .data (#1012)", {
+ idx <- 2L
+
+ expect_equal(
+ test_df %>%
+ grouping %>%
+ filter(.data[[letters[[idx]]]] < 2) %>%
+ select(-e),
+ test_df %>%
+ grouping %>%
+ filter(b < 2) %>%
+ select(-e)
+ )
+ })
+
+ test_that("filter understands .env (#1469)", {
+ b <- 2L
+
+ expect_equal(
+ filter(
+ test_df %>%
+ grouping,
+ b < .env$b) %>%
+ select(-e),
+ test_df %>%
+ grouping %>%
+ filter(b < 2) %>%
+ select(-e)
+ )
+ })
+
+ test_that("filter understands get(..., .env) in a pipe (#1469)", {
+ b <- 2L
+
+ expect_equal(
+ test_df %>%
+ grouping %>%
+ filter(b < get("b", envir = .env)) %>%
+ select(-e),
+ test_df %>%
+ grouping %>%
+ filter(b < 2) %>%
+ select(-e)
+ )
+ })
+
+ test_that("mutate understands .data (#1012)", {
+ expect_equal(
+ test_df %>%
+ grouping %>%
+ mutate(f = {
+ b <- 5
+ .data$b
+ }) %>%
+ select(-e),
+ test_df %>%
+ grouping %>%
+ mutate(f = b) %>%
+ select(-e)
+ )
+ })
+
+ test_that("mutate understands .data (#1012)", {
+ expect_equal(
+ test_df %>%
+ grouping %>%
+ mutate(f = .data[["b"]]) %>%
+ select(-e),
+ test_df %>%
+ grouping %>%
+ mutate(f = b) %>%
+ select(-e)
+ )
+ })
+
+ test_that("mutate understands .data (#1012)", {
+ idx <- 2L
+
+ expect_equal(
+ test_df %>%
+ grouping %>%
+ mutate(f = .data[[letters[[idx]]]]) %>%
+ select(-e),
+ test_df %>%
+ grouping %>%
+ mutate(f = b) %>%
+ select(-e)
+ )
+ })
+
+ test_that("mutate understands .env (#1469)", {
+ b <- 2L
+
+ expect_equal(
+ mutate(
+ test_df %>%
+ grouping,
+ f = .env$b) %>%
+ select(-e),
+ test_df %>%
+ grouping %>%
+ mutate(f = 2L) %>%
+ select(-e)
+ )
+ })
+
+ test_that("mutate understands get(..., .env) in a pipe (#1469)", {
+ b <- 2L
+
+ expect_equal(
+ test_df %>%
+ grouping %>%
+ mutate(f = get("b", .env)) %>%
+ select(-e),
+ test_df %>%
+ grouping %>%
+ mutate(f = 2L) %>%
+ select(-e)
+ )
+ })
+
+ test_that("summarise understands .data (#1012)", {
+ expect_equal(
+ test_df %>%
+ grouping %>%
+ summarise(f = { b <- 5; sum(.data$b) }),
+ test_df %>%
+ grouping %>%
+ summarise(f = sum(b)))
+ })
+
+ test_that("summarise understands .data (#1012)", {
+ expect_equal(
+ test_df %>%
+ grouping %>%
+ summarise(f = sum(.data[["b"]])),
+ test_df %>%
+ grouping %>%
+ summarise(f = sum(b))
+ )
+ })
+
+ test_that("summarise understands .data (#1012)", {
+ idx <- 2L
+
+ expect_equal(
+ test_df %>%
+ grouping %>%
+ summarise(f = sum(.data[[letters[[idx]]]])),
+ test_df %>%
+ grouping %>%
+ summarise(f = sum(b))
+ )
+ })
+
+ test_that("summarise understands .env (#1469)", {
+ b <- 2L
+
+ expect_equal(
+ summarise(
+ test_df %>%
+ grouping,
+ f = .env$b),
+ test_df %>%
+ grouping %>%
+ summarise(f = 2L)
+ )
+ })
+
+ test_that("summarise understands get(..., .env) in a pipe (#1469)", {
+ b <- 2L
+
+ expect_equal(
+ test_df %>%
+ grouping %>%
+ summarise(f = get("b", .env)),
+ test_df %>%
+ grouping %>%
+ summarise(f = 2L)
+ )
+ })
+
+ test_that("columns named .data and .env are overridden", {
+ conflict_data <- data_frame(id = test_df$id, .data = 1:3, .env = 3:1)
+
+ expect_equal(
+ conflict_data %>%
+ grouping %>%
+ summarise(env = list(.env), data = list(.data)) %>%
+ ungroup() %>%
+ summarise(
+ is_env_env = all(vapply(env, is.environment, logical(1))),
+ is_data_env = all(vapply(env, is.environment, logical(1)))
+ ),
+ data_frame(is_env_env = TRUE, is_data_env = TRUE)
+ )
+ })
+
+ test_that("contents of columns named .data and .env can be accessed", {
+ conflict_data <- data_frame(id = test_df$id, .data = 1:3, .env = 3:1)
+
+ expect_equal(
+ conflict_data %>%
+ grouping %>%
+ summarise(
+ env = mean(.data$.env),
+ data = mean(.data$.data)
+ ),
+ conflict_data %>%
+ rename(env = .env, data = .data) %>%
+ grouping %>%
+ summarise_at(vars(env, data), funs(mean))
+ )
+ })
+
+}
+
+test_hybrid(identity)
+test_hybrid(rowwise)
+test_hybrid(. %>% group_by(!! quo(id)))
diff --git a/tests/testthat/test-hybrid.R b/tests/testthat/test-hybrid.R
new file mode 100644
index 0000000..a2aa395
--- /dev/null
+++ b/tests/testthat/test-hybrid.R
@@ -0,0 +1,888 @@
+context("hybrid")
+
+test_that("hybrid evaluation environment is cleaned up (#2358)", {
+ # Can't use pipe here, f and g should have top-level parent.env()
+ df <- data_frame(x = 1)
+ df <- mutate(df, f = list(function(){}))
+ df <- mutate(df, g = list(quo(.)))
+ df <- mutate(df, h = list(~.))
+
+ expect_environments_clean(df$f[[1]])
+ expect_environments_clean(df$g[[1]])
+ expect_environments_clean(df$h[[1]])
+
+ # Avoids "Empty test" message
+ expect_true(TRUE)
+})
+
+test_that("n() and n_distinct() work", {
+ check_hybrid_result(
+ n(), a = 1:5,
+ expected = 5L, test_eval = FALSE
+ )
+ check_not_hybrid_result(
+ list(1:n()), a = 1:5,
+ expected = list(1:5), test_eval = FALSE
+ )
+
+ check_hybrid_result(
+ n_distinct(a), a = 1:5,
+ expected = 5L
+ )
+ check_hybrid_result(
+ n_distinct(a), a = rep(1L, 3),
+ expected = 1L
+ )
+ check_hybrid_result(
+ n_distinct(a, b), a = rep(1L, 3), b = 1:3,
+ expected = 3L
+ )
+ check_hybrid_result(
+ n_distinct(a, b), a = rep(1L, 3), b = c(1, 1, 2),
+ expected = 2L
+ )
+ check_hybrid_result(
+ n_distinct(a, b), a = rep(1L, 3), b = c(1, 1, NA),
+ expected = 2L
+ )
+ check_hybrid_result(
+ n_distinct(a, b, na.rm = TRUE), a = rep(1L, 3), b = c(1, 1, NA),
+ expected = 1L
+ )
+ check_hybrid_result(
+ n_distinct(a = a, b = b, na.rm = TRUE), a = rep(1L, 3), b = c(1, 1, NA),
+ expected = 1L
+ )
+
+ expect_not_hybrid_error(
+ n_distinct(), a = 1:5,
+ error = "Need at least one column for `n_distinct[(][)]`"
+ )
+})
+
+test_that("%in% works (#192)", {
+ # compilation errors on Windows
+ # https://ci.appveyor.com/project/hadley/dplyr/build/1.0.230
+ check_not_hybrid_result(
+ list(a %in% (1:3 * 1i)), a = 2:4 * 1i,
+ expected = list(c(TRUE, TRUE, FALSE))
+ )
+
+ check_not_hybrid_result(
+ list(a %in% 1:3), a = as.numeric(2:4),
+ expected = list(c(TRUE, TRUE, FALSE))
+ )
+ check_not_hybrid_result(
+ list(a %in% as.numeric(1:3)), a = 2:4,
+ expected = list(c(TRUE, TRUE, FALSE))
+ )
+
+ c <- 2:4
+ check_not_hybrid_result(
+ list(c %in% 1:3), a = as.numeric(2:4),
+ expected = list(c(TRUE, TRUE, FALSE))
+ )
+})
+
+test_that("min() and max() work", {
+ check_hybrid_result(
+ min(a), a = 1:5,
+ expected = 1,
+ test_eval = FALSE
+ )
+ check_hybrid_result(
+ max(a), a = 1:5,
+ expected = 5,
+ test_eval = FALSE
+ )
+ check_hybrid_result(
+ min(a), a = as.numeric(1:5),
+ expected = 1
+ )
+ check_hybrid_result(
+ max(a), a = as.numeric(1:5),
+ expected = 5
+ )
+ check_hybrid_result(
+ min(a), a = c(1:5, NA),
+ expected = NA_real_,
+ test_eval = FALSE
+ )
+ check_hybrid_result(
+ max(a), a = c(1:5, NA),
+ expected = NA_real_,
+ test_eval = FALSE
+ )
+ check_hybrid_result(
+ min(a), a = c(NA, 1:5),
+ expected = NA_real_,
+ test_eval = FALSE
+ )
+ check_hybrid_result(
+ max(a), a = c(NA, 1:5),
+ expected = NA_real_,
+ test_eval = FALSE
+ )
+
+ c <- 1:3
+ check_not_hybrid_result(
+ min(c), a = 1:5,
+ expected = 1L
+ )
+ check_not_hybrid_result(
+ max(c), a = 1:5,
+ expected = 3L
+ )
+
+ check_not_hybrid_result(
+ min(a), a = letters,
+ expected = "a"
+ )
+ check_not_hybrid_result(
+ max(a), a = letters,
+ expected = "z"
+ )
+ check_not_hybrid_result(
+ min(a), a = c(letters, NA),
+ expected = NA_character_
+ )
+ check_not_hybrid_result(
+ max(a), a = c(letters, NA),
+ expected = NA_character_
+ )
+ check_not_hybrid_result(
+ min(a, na.rm = TRUE), a = c(letters, NA),
+ expected = "a"
+ )
+ check_not_hybrid_result(
+ max(a, na.rm = TRUE), a = c(letters, NA),
+ expected = "z"
+ )
+
+ check_hybrid_result(
+ min(a, na.rm = TRUE), a = NA_real_,
+ expected = Inf,
+ test_eval = FALSE
+ )
+ check_hybrid_result(
+ max(a, na.rm = TRUE), a = NA_real_,
+ expected = -Inf,
+ test_eval = FALSE
+ )
+ check_hybrid_result(
+ min(a), a = numeric(),
+ expected = Inf,
+ test_eval = FALSE
+ )
+ check_hybrid_result(
+ max(a), a = numeric(),
+ expected = -Inf,
+ test_eval = FALSE
+ )
+
+ check_hybrid_result(
+ max(a, na.rm = TRUE), a = NA_integer_,
+ expected = -Inf,
+ test_eval = FALSE
+ )
+ check_hybrid_result(
+ min(a, na.rm = TRUE), a = NA_integer_,
+ expected = Inf,
+ test_eval = FALSE
+ )
+ check_hybrid_result(
+ max(a), a = integer(),
+ expected = -Inf,
+ test_eval = FALSE
+ )
+ check_hybrid_result(
+ min(a), a = integer(),
+ expected = Inf,
+ test_eval = FALSE
+ )
+})
+
+test_that("first(), last(), and nth() work", {
+ check_hybrid_result(
+ first(a), a = 1:5,
+ expected = 1L
+ )
+ check_hybrid_result(
+ last(a), a = as.numeric(1:5),
+ expected = 5
+ )
+ check_hybrid_result(
+ nth(a, 6, default = 3), a = as.numeric(1:5),
+ expected = 3
+ )
+ check_hybrid_result(
+ nth(a, 6, def = 3), a = as.numeric(1:5),
+ expected = 3
+ )
+ check_hybrid_result(
+ nth(a, 6.5), a = 1:5,
+ expected = NA_integer_
+ )
+
+ check_not_hybrid_result(
+ nth(a, b[[2]]), a = letters[1:5], b = 5:1,
+ expected = "d"
+ )
+
+ check_hybrid_result(
+ nth(a, 3), a = as.numeric(1:5) * 1i,
+ expected = 3i
+ )
+ check_not_hybrid_result(
+ nth(a, 2), a = as.list(1:5),
+ expected = 2L
+ )
+
+ check_not_hybrid_result(
+ nth(a, order_by = 5:1, 2), a = 1:5,
+ expected = 4L
+ )
+ expect_not_hybrid_error(
+ first(a, bogus = 3), a = 1:5,
+ error = "unused argument"
+ )
+ expect_not_hybrid_error(
+ last(a, bogus = 3), a = 1:5,
+ error = "unused argument"
+ )
+ expect_not_hybrid_error(
+ nth(a, 3, bogus = 3), a = 1:5,
+ error = "unused argument"
+ )
+
+ c <- 1:3
+ check_not_hybrid_result(
+ first(c), a = 2:4,
+ expected = 1L
+ )
+ check_not_hybrid_result(
+ last(c), a = 2:4,
+ expected = 3L
+ )
+ check_not_hybrid_result(
+ nth(c, 2), a = 2:4,
+ expected = 2L
+ )
+
+ check_hybrid_result(
+ first(a, order_by = b), a = 1:5, b = 5:1,
+ expected = 5L
+ )
+
+ default_value <- 6L
+ check_not_hybrid_result(
+ nth(a, 6, default = default_value), a = 1:5,
+ expected = 6L
+ )
+
+ expect_equal(
+ tibble(a = c(1, 1, 2), b = letters[1:3]) %>%
+ group_by(a) %>%
+ summarize(b = b[1], b = first(b)) %>%
+ ungroup,
+ tibble(a = c(1, 2), b = c("a", "c"))
+ )
+})
+
+test_that("lead() and lag() work", {
+ check_hybrid_result(
+ list(lead(a)), a = 1:5,
+ expected = list(c(2:5, NA))
+ )
+ check_hybrid_result(
+ list(lag(a)), a = 1:5,
+ expected = list(c(NA, 1:4))
+ )
+
+ check_hybrid_result(
+ list(lead(a)), a = as.numeric(1:5),
+ expected = list(c(as.numeric(2:5), NA))
+ )
+ check_hybrid_result(
+ list(lag(a)), a = as.numeric(1:5),
+ expected = list(c(NA, as.numeric(1:4)))
+ )
+
+ check_hybrid_result(
+ list(lead(a)), a = letters[1:5],
+ expected = list(c(letters[2:5], NA))
+ )
+ check_hybrid_result(
+ list(lag(a)), a = letters[1:5],
+ expected = list(c(NA, letters[1:4]))
+ )
+
+ check_hybrid_result(
+ list(lead(a)), a = c(TRUE, FALSE),
+ expected = list(c(FALSE, NA))
+ )
+ check_hybrid_result(
+ list(lag(a)), a = c(TRUE, FALSE),
+ expected = list(c(NA, TRUE))
+ )
+
+ check_not_hybrid_result(
+ list(lead(a, order_by = b)), a = 1:5, b = 5:1,
+ expected = list(c(NA, 1:4))
+ )
+ check_not_hybrid_result(
+ list(lag(a, order_by = b)), a = 1:5, b = 5:1,
+ expected = list(c(2:5, NA))
+ )
+
+ check_hybrid_result(
+ list(lead(a)), a = 1:5 * 1i,
+ expected = list(c(2:5, NA) * 1i)
+ )
+ check_hybrid_result(
+ list(lag(a)), a = 1:5 * 1i,
+ expected = list(c(NA, 1:4) * 1i)
+ )
+
+ v <- 1:2
+ check_not_hybrid_result(
+ list(lead(a, v[1])), a = 1:5,
+ expected = list(c(2:5, NA))
+ )
+ check_not_hybrid_result(
+ list(lag(a, v[1])), a = 1:5,
+ expected = list(c(NA, 1:4))
+ )
+})
+
+test_that("mean(), var(), sd() and sum() work", {
+ check_hybrid_result(
+ mean(a), a = 1:5,
+ expected = 3
+ )
+ check_hybrid_result(
+ var(a), a = 1:3,
+ expected = 1
+ )
+ check_hybrid_result(
+ sd(a), a = 1:3,
+ expected = 1
+ )
+ check_hybrid_result(
+ sum(a), a = 1:5,
+ expected = 15L
+ )
+ check_hybrid_result(
+ sum(a), a = as.numeric(1:5),
+ expected = 15
+ )
+
+ check_hybrid_result(
+ mean(a), a = c(1:5, NA),
+ expected = NA_real_
+ )
+ check_hybrid_result(
+ var(a), a = c(1:3, NA),
+ expected = NA_real_
+ )
+ check_hybrid_result(
+ sum(a), a = c(1:5, NA),
+ expected = NA_integer_
+ )
+ check_hybrid_result(
+ sum(a), a = c(as.numeric(1:5), NA),
+ expected = NA_real_
+ )
+
+ check_not_hybrid_result(
+ sd(a, TRUE), a = c(1:3, NA),
+ expected = 1
+ )
+
+ check_not_hybrid_result(
+ sd(a, na.rm = b[[1]]), a = c(1:3, NA), b = TRUE,
+ expected = 1
+ )
+
+ check_hybrid_result(
+ sd(a), a = c(1:3, NA),
+ expected = is.na
+ )
+})
+
+test_that("row_number(), ntile(), min_rank(), percent_rank(), dense_rank(), and cume_dist() work", {
+ check_hybrid_result(
+ list(row_number()), a = 1:5,
+ expected = list(1:5),
+ test_eval = FALSE
+ )
+ check_hybrid_result(
+ list(row_number(a)), a = 5:1,
+ expected = list(5:1)
+ )
+ check_hybrid_result(
+ list(min_rank(a)), a = c(1, 3, 2, 3, 1),
+ expected = list(c(1L, 4L, 3L, 4L, 1L))
+ )
+ check_hybrid_result(
+ list(percent_rank(a)), a = c(1, 3, 2, 3, 1),
+ expected = list((c(1L, 4L, 3L, 4L, 1L) - 1) / 4)
+ )
+ check_hybrid_result(
+ list(cume_dist(a)), a = c(1, 3, 2, 3),
+ expected = list(c(0.25, 1.0, 0.5, 1.0))
+ )
+ check_hybrid_result(
+ list(dense_rank(a)), a = c(1, 3, 2, 3, 1),
+ expected = list(c(1L, 3L, 2L, 3L, 1L))
+ )
+
+ expect_not_hybrid_error(
+ row_number(a, 1), a = 5:1,
+ error = "unused argument"
+ )
+ expect_not_hybrid_error(
+ min_rank(a, 1), a = 5:1,
+ error = "unused argument"
+ )
+ expect_not_hybrid_error(
+ percent_rank(a, 1), a = 5:1,
+ error = "unused argument"
+ )
+ expect_not_hybrid_error(
+ cume_dist(a, 1), a = 5:1,
+ error = "unused argument"
+ )
+ expect_not_hybrid_error(
+ dense_rank(a, 1), a = 5:1,
+ error = "unused argument"
+ )
+ expect_not_hybrid_error(
+ ntile(a, 2, 1), a = 5:1,
+ error = "unused argument"
+ )
+
+ check_not_hybrid_result(
+ row_number("a"), a = 5:1,
+ expected = 1L
+ )
+ check_not_hybrid_result(
+ min_rank("a"), a = 5:1,
+ expected = 1L
+ )
+ check_not_hybrid_result(
+ percent_rank("a"), a = 5:1,
+ expected = is.nan
+ )
+ check_not_hybrid_result(
+ cume_dist("a"), a = 5:1,
+ expected = 1
+ )
+ check_not_hybrid_result(
+ dense_rank("a"), a = 5:1,
+ expected = 1L
+ )
+ check_not_hybrid_result(
+ ntile("a", 2), a = 5:1,
+ expected = 1L
+ )
+
+ expect_equal(
+ tibble(a = c(1, 1, 2), b = letters[1:3]) %>%
+ group_by(a) %>%
+ summarize(b = b[1], b = min_rank(desc(b))) %>%
+ ungroup,
+ tibble(a = c(1, 2), b = c(1L, 1L))
+ )
+})
+
+test_that("hybrid handlers don't nest", {
+ check_not_hybrid_result(
+ mean(lag(a)), a = 1:5,
+ expected = NA_real_
+ )
+ check_not_hybrid_result(
+ mean(row_number()), a = 1:5,
+ expected = 3,
+ test_eval = FALSE
+ )
+ check_not_hybrid_result(
+ list(lag(cume_dist(a))), a = 1:4,
+ expected = list(c(NA, 0.25, 0.5, 0.75))
+ )
+})
+
+test_that("constant folding and argument matching in hybrid evaluator (#2299)", {
+ skip("Currently failing")
+ skip("Currently failing (external var)")
+ c <- 1:3
+ check_not_hybrid_result(
+ n_distinct(c), a = 1:5,
+ expected = 3L, test_eval = FALSE
+ )
+ check_not_hybrid_result(
+ n_distinct(a, c), a = 1:3,
+ expected = 3L, test_eval = FALSE
+ )
+ check_not_hybrid_result(
+ n_distinct(a, b, na.rm = 1), a = rep(1L, 3), b = c(1, 1, NA),
+ expected = 1L
+ )
+
+ skip("Currently failing (constfold)")
+ check_hybrid_result(
+ list(a %in% 1:3), a = 2:4,
+ expected = list(c(TRUE, TRUE, FALSE))
+ )
+ check_hybrid_result(
+ list(a %in% as.numeric(1:3)), a = as.numeric(2:4),
+ expected = list(c(TRUE, TRUE, FALSE))
+ )
+ check_hybrid_result(
+ list(a %in% letters[1:3]), a = letters[2:4],
+ expected = list(c(TRUE, TRUE, FALSE))
+ )
+ check_hybrid_result(
+ list(a %in% c(TRUE, FALSE)), a = c(TRUE, FALSE, NA),
+ expected = list(c(TRUE, TRUE, FALSE))
+ )
+
+ skip("Currently failing")
+ check_hybrid_result(
+ list(a %in% NA_integer_), a = c(2:4, NA),
+ expected = list(c(FALSE, FALSE, FALSE, TRUE))
+ )
+ check_hybrid_result(
+ list(a %in% NA_real_), a = as.numeric(c(2:4, NA)),
+ expected = list(c(FALSE, FALSE, FALSE, TRUE))
+ )
+ check_hybrid_result(
+ list(a %in% NA_character_), a = c(letters[2:4], NA),
+ expected = list(c(FALSE, FALSE, FALSE, TRUE))
+ )
+ check_hybrid_result(
+ list(a %in% NA), a = c(TRUE, FALSE, NA),
+ expected = list(c(FALSE, FALSE, TRUE))
+ )
+
+ skip("Currently failing (constfold)")
+ check_hybrid_result(
+ min(a, na.rm = (1 == 0)), a = c(1:5, NA),
+ expected = NA_integer_
+ )
+ check_hybrid_result(
+ max(a, na.rm = (1 == 0)), a = c(1:5, NA),
+ expected = NA_integer_
+ )
+ check_hybrid_result(
+ min(a, na.rm = (1 == 1)), a = c(1:5, NA),
+ expected = 1L
+ )
+ check_hybrid_result(
+ max(a, na.rm = (1 == 1)), a = c(1:5, NA),
+ expected = 5L
+ )
+
+ check_hybrid_result(
+ min(a, na.rm = pi != pi), a = c(1:5, NA),
+ expected = NA_integer_
+ )
+ check_hybrid_result(
+ max(a, na.rm = pi != pi), a = c(1:5, NA),
+ expected = NA_integer_
+ )
+ check_hybrid_result(
+ min(a, na.rm = pi == pi), a = c(1:5, NA),
+ expected = 1L
+ )
+ check_hybrid_result(
+ max(a, na.rm = pi == pi), a = c(1:5, NA),
+ expected = 5L
+ )
+
+ skip("Currently failing")
+ check_hybrid_result(
+ min(a, na.rm = F), a = c(1:5, NA),
+ expected = NA_integer_
+ )
+ check_hybrid_result(
+ max(a, na.rm = F), a = c(1:5, NA),
+ expected = NA_integer_
+ )
+ check_hybrid_result(
+ min(a, na.rm = T), a = c(1:5, NA),
+ expected = 1L
+ )
+ check_hybrid_result(
+ max(a, na.rm = T), a = c(1:5, NA),
+ expected = 5L
+ )
+
+ skip("Currently failing (constfold)")
+ check_hybrid_result(
+ nth(a, 1 + 2), a = letters[1:5],
+ expected = "c"
+ )
+
+ check_hybrid_result(
+ nth(a, -4), a = 1:5,
+ expected = 2L
+ )
+
+ skip("Currently failing (constfold)")
+ check_hybrid_result(
+ list(lead(a, 1L + 2L)), a = 1:5,
+ expected = list(c(4:5, NA, NA, NA))
+ )
+ check_hybrid_result(
+ list(lag(a, 4L - 2L)), a = as.numeric(1:5),
+ expected = list(c(NA, NA, as.numeric(1:3)))
+ )
+
+ check_not_hybrid_result(
+ list(lead(a, default = 2 + 4)), a = 1:5,
+ expected = list(as.numeric(2:6))
+ )
+ check_not_hybrid_result(
+ list(lag(a, default = 3L - 3L)), a = as.numeric(1:5),
+ expected = list(as.numeric(0:4))
+ )
+
+ check_hybrid_result(
+ list(lead(a, 1 + 2)), a = 1:5,
+ expected = list(c(4:5, NA, NA, NA))
+ )
+ check_hybrid_result(
+ list(lag(a, 4 - 2)), a = as.numeric(1:5),
+ expected = list(c(NA, NA, as.numeric(1:3)))
+ )
+
+ check_hybrid_result(
+ list(lead(a, default = 2L + 4L)), a = 1:5,
+ expected = list(2:6)
+ )
+ check_hybrid_result(
+ list(lag(a, default = 3L - 3L)), a = 1:5,
+ expected = list(0:4)
+ )
+
+ check_hybrid_result(
+ list(lead(a, def = 2L + 4L)), a = 1:5,
+ expected = list(2:6)
+ )
+ check_hybrid_result(
+ list(lag(a, def = 3L - 3L)), a = 1:5,
+ expected = list(0:4)
+ )
+
+ check_hybrid_result(
+ list(lead(a, 2, 2L + 4L)), a = 1:5,
+ expected = list(c(3:6, 6L))
+ )
+ check_hybrid_result(
+ list(lag(a, 3, 3L - 3L)), a = 1:5,
+ expected = list(c(0L, 0L, 0:2))
+ )
+
+ skip("Currently failing")
+ check_hybrid_result(
+ mean(a, na.rm = (1 == 0)), a = c(1:5, NA),
+ expected = NA_real_
+ )
+ check_hybrid_result(
+ var(a, na.rm = (1 == 0)), a = c(1:3, NA),
+ expected = NA_real_
+ )
+ check_hybrid_result(
+ sd(a, na.rm = (1 == 0)), a = c(1:3, NA),
+ expected = NA_real_
+ )
+ check_hybrid_result(
+ sum(a, na.rm = (1 == 0)), a = c(1:5, NA),
+ expected = NA_integer_
+ )
+ check_hybrid_result(
+ sum(a, na.rm = (1 == 0)), a = c(as.numeric(1:5), NA),
+ expected = NA_real_
+ )
+
+ check_hybrid_result(
+ mean(a, na.rm = (1 == 1)), a = c(1:5, NA),
+ expected = 3
+ )
+ check_hybrid_result(
+ var(a, na.rm = (1 == 1)), a = c(1:3, NA),
+ expected = 1
+ )
+ check_hybrid_result(
+ sd(a, na.rm = (1 == 1)), a = c(1:3, NA),
+ expected = 1
+ )
+ check_hybrid_result(
+ sum(a, na.rm = (1 == 1)), a = c(1:5, NA),
+ expected = 15L
+ )
+ check_hybrid_result(
+ sum(a, na.rm = (1 == 1)), a = c(as.numeric(1:5), NA),
+ expected = 15
+ )
+
+ check_hybrid_result(
+ mean(na.rm = (1 == 1), a), a = c(1:5, NA),
+ expected = 3
+ )
+ check_hybrid_result(
+ var(na.rm = (1 == 1), a), a = c(1:3, NA),
+ expected = 1
+ )
+ check_hybrid_result(
+ sd(na.rm = (1 == 1), a), a = c(1:3, NA),
+ expected = 1
+ )
+ check_hybrid_result(
+ sum(na.rm = (1 == 1), a), a = c(1:5, NA),
+ expected = 15L
+ )
+ check_hybrid_result(
+ sum(na.rm = (1 == 1), a), a = c(as.numeric(1:5), NA),
+ expected = 15
+ )
+
+ skip("Currently failing (constfold)")
+ check_hybrid_result(
+ list(ntile(a, 1 + 2)), a = c(1, 3, 2, 3, 1),
+ expected = list(c(1L, 2L, 2L, 3L, 1L))
+ )
+ check_hybrid_result(
+ list(ntile(a, 1L + 2L)), a = c(1, 3, 2, 3, 1),
+ expected = list(c(1L, 2L, 2L, 3L, 1L))
+ )
+ check_hybrid_result(
+ list(ntile(n = 1 + 2, a)), a = c(1, 3, 2, 3, 1),
+ expected = list(c(1L, 2L, 2L, 3L, 1L))
+ )
+
+ skip("Currently failing")
+ df <- data_frame(x = c(NA, 1L, 2L, NA, 3L, 4L, NA))
+ expected <- rep(4L, nrow(df))
+
+ expect_equal(mutate(df, y = last(na.omit(x)))$y, expected)
+ expect_equal(mutate(df, y = last(x[!is.na(x)]))$y, expected)
+ expect_equal(mutate(df, y = x %>% na.omit() %>% last())$y, expected)
+ expect_equal(mutate(df, y = x %>% na.omit %>% last)$y, expected)
+
+ data_frame(x = c(1, 1)) %>%
+ mutate(y = 1) %>%
+ summarise(z = first(x, order_by = y))
+})
+
+test_that("simple handlers supports quosured symbols", {
+ mean <- sum <- var <- sd <- bad_hybrid_handler
+
+ expect_identical(
+ pull(summarise(mtcars, mean(!! quo(cyl)))),
+ base::mean(mtcars$cyl)
+ )
+ expect_identical(
+ pull(summarise(mtcars, sum(!! quo(cyl)))),
+ base::sum(mtcars$cyl)
+ )
+ expect_identical(
+ pull(summarise(mtcars, sd(!! quo(cyl)))),
+ stats::sd(mtcars$cyl)
+ )
+ expect_identical(
+ pull(summarise(mtcars, var(!! quo(cyl)))),
+ stats::var(mtcars$cyl)
+ )
+})
+
+test_that("%in% handler supports quosured symbols", {
+ `%in%` <- bad_hybrid_handler
+ expect_identical(
+ pull(mutate(mtcars, UQ(quo(cyl)) %in% 4)),
+ base::`%in%`(mtcars$cyl, 4)
+ )
+})
+
+test_that("min() and max() handlers supports quosured symbols", {
+ min <- max <- bad_hybrid_handler
+
+ expect_identical(
+ pull(summarise(mtcars, min(!! quo(cyl)))),
+ base::min(mtcars$cyl)
+ )
+ expect_identical(
+ pull(summarise(mtcars, max(!! quo(cyl)))),
+ base::max(mtcars$cyl)
+ )
+})
+
+test_that("lead/lag handlers support quosured symbols", {
+ lead <- lag <- bad_hybrid_handler
+
+ expect_identical(
+ pull(mutate(mtcars, lead(!! quo(cyl)))),
+ dplyr::lead(mtcars$cyl)
+ )
+ expect_identical(
+ pull(mutate(mtcars, lag(!! quo(cyl)))),
+ dplyr::lag(mtcars$cyl)
+ )
+})
+
+test_that("window handlers supports quosured symbols", {
+ ntile <- min_rank <- percent_rank <- dense_rank <- cume_dist <- bad_hybrid_handler
+
+ expect_identical(
+ pull(mutate(mtcars, ntile(!! quo(disp), 2))),
+ dplyr::ntile(mtcars$disp, 2)
+ )
+ expect_identical(
+ pull(mutate(mtcars, min_rank(!! quo(cyl)))),
+ dplyr::min_rank(mtcars$cyl)
+ )
+ expect_identical(
+ pull(mutate(mtcars, percent_rank(!! quo(cyl)))),
+ dplyr::percent_rank(mtcars$cyl)
+ )
+ expect_identical(
+ pull(mutate(mtcars, dense_rank(!! quo(cyl)))),
+ dplyr::dense_rank(mtcars$cyl)
+ )
+ expect_identical(
+ pull(mutate(mtcars, cume_dist(!! quo(cyl)))),
+ dplyr::cume_dist(mtcars$cyl)
+ )
+})
+
+test_that("n_distinct() handler supports quosured symbols", {
+ n_distinct <- bad_hybrid_handler
+
+ expect_identical(
+ pull(summarise(mtcars, n_distinct(!! quo(cyl)))),
+ dplyr::n_distinct(mtcars$cyl)
+ )
+})
+
+test_that("nth handlers support quosured symbols", {
+ first <- last <- nth <- bad_hybrid_handler
+
+ expect_identical(
+ pull(summarise(mtcars, first(!! quo(cyl)))),
+ dplyr::first(mtcars$cyl)
+ )
+ expect_identical(
+ pull(summarise(mtcars, last(!! quo(cyl)))),
+ dplyr::last(mtcars$cyl)
+ )
+ expect_identical(
+ pull(summarise(mtcars, nth(!! quo(cyl), 2))),
+ dplyr::nth(mtcars$cyl, 2)
+ )
+})
+
+test_that("top_n() is hybridised (#2822)", {
+ min_rank <- bad_hybrid_handler
+ expect_error(top_n(mtcars, 1, cyl), NA)
+})
diff --git a/tests/testthat/test-if-else.R b/tests/testthat/test-if-else.R
index 4a54ad5..a1df9d5 100644
--- a/tests/testthat/test-if-else.R
+++ b/tests/testthat/test-if-else.R
@@ -1,20 +1,40 @@
context("if_else")
test_that("first argument must be logical", {
- expect_error(if_else(1:10, 1, 2), "must be logical")
+ expect_error(
+ if_else(1:10, 1, 2),
+ "`condition` must be a logical, not integer",
+ fixed = TRUE
+ )
})
test_that("true and false must be same length as condition (or length 1)", {
- expect_error(if_else(1:3 < 2, 1:2, 1:3), "`true` is length 2 not 1 or 3")
- expect_error(if_else(1:3 < 2, 1:3, 1:2), "`false` is length 2 not 1 or 3")
+ expect_error(
+ if_else(1:3 < 2, 1:2, 1:3),
+ "`true` must be length 3 (length of `condition`) or one, not 2",
+ fixed = TRUE
+ )
+ expect_error(
+ if_else(1:3 < 2, 1:3, 1:2),
+ "`false` must be length 3 (length of `condition`) or one, not 2",
+ fixed = TRUE
+ )
})
test_that("true and false must be same type and same class", {
- expect_error(if_else(1:3 < 2, 1, 1L), "`false` has type 'integer'")
+ expect_error(
+ if_else(1:3 < 2, 1, 1L),
+ "`false` must be type double, not integer",
+ fixed = TRUE
+ )
x <- factor("x")
y <- ordered("x")
- expect_error(if_else(1:3 < 2, x, y), "`false` has class ordered/factor")
+ expect_error(
+ if_else(1:3 < 2, x, y),
+ "`false` must be factor, not ordered/factor",
+ fixed = TRUE
+ )
})
test_that("scalar true and false are vectorised", {
@@ -42,10 +62,74 @@ test_that("works with lists", {
)
})
-test_that("all forms of if translated to case statement", {
- expected <- sql('CASE WHEN ("x") THEN (1) ELSE (2) END')
+test_that("better factor support (#2197)", {
+ skip("Currently failing")
+
+ test_that("gives proper error messages for factor class (#2197)", {
+ x <- factor(1:3, labels = letters[1:3])
+
+ expect_error(
+ if_else(x == "a", "b", x),
+ "asdf",
+ fixed = TRUE
+ )
+ expect_error(
+ if_else(x == "a", 1L, x),
+ "asdf",
+ fixed = TRUE
+ )
+ expect_error(
+ if_else(x == "a", 1., x),
+ "asdf",
+ fixed = TRUE
+ )
+ expect_error(
+ if_else(x == "a", TRUE, x),
+ "asdf",
+ fixed = TRUE
+ )
+ expect_error(
+ if_else(x == "a", Sys.Date(), x),
+ "asdf",
+ fixed = TRUE
+ )
+
+ expect_error(
+ if_else(x == "a", x, "b"),
+ "asdf",
+ fixed = TRUE
+ )
+ expect_error(
+ if_else(x == "a", x, 1L),
+ "asdf",
+ fixed = TRUE
+ )
+ expect_error(
+ if_else(x == "a", x, 1.),
+ "asdf",
+ fixed = TRUE
+ )
+ expect_error(
+ if_else(x == "a", x, TRUE),
+ "asdf",
+ fixed = TRUE
+ )
+ expect_error(
+ if_else(x == "a", x, Sys.Date()),
+ "asdf",
+ fixed = TRUE
+ )
+ })
+
+ test_that("works with factors as both `true` and `false` (#2197)", {
+ x <- factor(1:3, labels = letters[1:3])
+ y <- factor(1:3, labels = letters[c(1, 2, 4)])
+
+ expect_equal(if_else(x == "a", x[[2]], x), x[c(2, 2, 3)])
- expect_equal(translate_sql(if (x) 1L else 2L), expected)
- expect_equal(translate_sql(ifelse(x, 1L, 2L)), expected)
- expect_equal(translate_sql(if_else(x, 1L, 2L)), expected)
+ expect_error(
+ if_else(x == "a", x, y),
+ "asdf levels in `false` don't match levels in `true`"
+ )
+ })
})
diff --git a/tests/testthat/test-internals.r b/tests/testthat/test-internals.r
index 76eb9d4..f77aa8a 100644
--- a/tests/testthat/test-internals.r
+++ b/tests/testthat/test-internals.r
@@ -2,6 +2,15 @@ context("internals")
test_that("comparisons<REALSXP> works as expected (#275)", {
res <- test_comparisons()
- expect_true( all(res) )
+ expect_true(all(res))
})
+test_that("join_match() works as expected", {
+ res <- test_matches()
+ expect_true(all(unlist(res)))
+})
+
+test_that("wrapping of length values works as expected", {
+ res <- test_length_wrap()
+ expect_true(all(res))
+})
diff --git a/tests/testthat/test-joins.r b/tests/testthat/test-joins.r
index 9f51059..d594ffb 100644
--- a/tests/testthat/test-joins.r
+++ b/tests/testthat/test-joins.r
@@ -151,23 +151,88 @@ test_that("can control suffixes with suffix argument", {
expect_named(j4, c("x", "z1", "z2"))
})
+test_that("can handle empty string in suffix argument, left side (#2228, #2182, #2007)", {
+ j1 <- inner_join(e, f, "x", suffix = c("", "2"))
+ j2 <- left_join(e, f, "x", suffix = c("", "2"))
+ j3 <- right_join(e, f, "x", suffix = c("", "2"))
+ j4 <- full_join(e, f, "x", suffix = c("", "2"))
+
+ expect_named(j1, c("x", "z", "z2"))
+ expect_named(j2, c("x", "z", "z2"))
+ expect_named(j3, c("x", "z", "z2"))
+ expect_named(j4, c("x", "z", "z2"))
+})
+
+test_that("can handle empty string in suffix argument, right side (#2228, #2182, #2007)", {
+ j1 <- inner_join(e, f, "x", suffix = c("1", ""))
+ j2 <- left_join(e, f, "x", suffix = c("1", ""))
+ j3 <- right_join(e, f, "x", suffix = c("1", ""))
+ j4 <- full_join(e, f, "x", suffix = c("1", ""))
+
+ expect_named(j1, c("x", "z1", "z"))
+ expect_named(j2, c("x", "z1", "z"))
+ expect_named(j3, c("x", "z1", "z"))
+ expect_named(j4, c("x", "z1", "z"))
+})
+
+test_that("disallow empty string in both sides of suffix argument (#2228)", {
+ expect_error(
+ inner_join(e, f, "x", suffix = c("", "")),
+ "`suffix` can't be empty string for both `x` and `y` suffixes",
+ fixed = TRUE
+ )
+ expect_error(
+ left_join(e, f, "x", suffix = c("", "")),
+ "`suffix` can't be empty string for both `x` and `y` suffixes",
+ fixed = TRUE
+ )
+ expect_error(
+ right_join(e, f, "x", suffix = c("", "")),
+ "`suffix` can't be empty string for both `x` and `y` suffixes",
+ fixed = TRUE
+ )
+ expect_error(
+ full_join(e, f, "x", suffix = c("", "")),
+ "`suffix` can't be empty string for both `x` and `y` suffixes",
+ fixed = TRUE
+ )
+})
+
+test_that("check suffix input", {
+ expect_error(
+ inner_join(e, f, "x", suffix = letters[1:3]),
+ "`suffix` must be a character vector of length 2, not character of length 3",
+ fixed = TRUE
+ )
+ expect_error(
+ inner_join(e, f, "x", suffix = letters[1]),
+ "`suffix` must be a character vector of length 2, not string of length 1",
+ fixed = TRUE
+ )
+ expect_error(
+ inner_join(e, f, "x", suffix = 1:2),
+ "`suffix` must be a character vector of length 2, not integer of length 2",
+ fixed = TRUE
+ )
+})
+
test_that("inner_join does not segfault on NA in factors (#306)", {
- a <- data.frame(x=c("p", "q", NA), y=c(1, 2, 3), stringsAsFactors=TRUE)
- b <- data.frame(x=c("p", "q", "r"), z=c(4,5,6), stringsAsFactors=TRUE)
+ a <- data.frame(x = c("p", "q", NA), y = c(1, 2, 3), stringsAsFactors = TRUE)
+ b <- data.frame(x = c("p", "q", "r"), z = c(4, 5, 6), stringsAsFactors = TRUE)
expect_warning(res <- inner_join(a, b, "x"), "joining factors with different levels")
- expect_equal( nrow(res), 2L )
+ expect_equal(nrow(res), 2L)
})
test_that("joins don't reorder columns #328", {
- a <- data.frame(a=1:3)
- b <- data.frame(a=1:3, b=1, c=2, d=3, e=4, f=5)
+ a <- data.frame(a = 1:3)
+ b <- data.frame(a = 1:3, b = 1, c = 2, d = 3, e = 4, f = 5)
res <- left_join(a, b, "a")
- expect_equal( names(res), names(b) )
+ expect_equal(names(res), names(b))
})
test_that("join handles type promotions #123", {
df <- data.frame(
- V1 = c(rep("a",5), rep("b",5)),
+ V1 = c(rep("a", 5), rep("b", 5)),
V2 = rep(c(1:5), 2),
V3 = c(101:110),
stringsAsFactors = FALSE
@@ -179,31 +244,40 @@ test_that("join handles type promotions #123", {
stringsAsFactors = FALSE
)
res <- semi_join(df, match, c("V1", "V2"))
- expect_equal( res$V2, 3:4 )
- expect_equal( res$V3, c(103L, 109L) )
+ expect_equal(res$V2, 3:4)
+ expect_equal(res$V3, c(103L, 109L))
})
-test_that("indices don't get mixed up when nrow(x) > nrow(y). #365",{
+test_that("indices don't get mixed up when nrow(x) > nrow(y). #365", {
a <- data.frame(V1 = c(0, 1, 2), V2 = c("a", "b", "c"), stringsAsFactors = FALSE)
b <- data.frame(V1 = c(0, 1), V3 = c("n", "m"), stringsAsFactors = FALSE)
res <- inner_join(a, b, by = "V1")
- expect_equal( res$V1, c(0,1) )
- expect_equal( res$V2, c("a", "b"))
- expect_equal( res$V3, c("n", "m"))
+ expect_equal(res$V1, c(0, 1))
+ expect_equal(res$V2, c("a", "b"))
+ expect_equal(res$V3, c("n", "m"))
})
test_that("join functions error on column not found #371", {
expect_error(
- left_join(data.frame(x=1:5), data.frame(y=1:5), by="x"),
- "column not found in lhs"
+ left_join(data.frame(x = 1:5), data.frame(y = 1:5), by = "x"),
+ "`by` can't contain join column `x` which is missing from RHS",
+ fixed = TRUE
)
expect_error(
- left_join(data.frame(x=1:5), data.frame(y=1:5), by="y"),
- "column not found in rhs"
+ left_join(data.frame(x = 1:5), data.frame(y = 1:5), by = "y"),
+ "`by` can't contain join column `y` which is missing from LHS",
+ fixed = TRUE
)
expect_error(
- left_join(data.frame(x=1:5), data.frame(y=1:5)),
- "No common variables"
+ left_join(data.frame(x = 1:5), data.frame(y = 1:5)),
+ "`by` required, because the data sources have no common variables",
+ fixed = TRUE
+ )
+
+ expect_error(
+ left_join(data.frame(x = 1:5), data.frame(y = 1:5), by = 1:3),
+ "`by` must be a (named) character vector, list, or NULL for natural joins (not recommended in production code), not integer",
+ fixed = TRUE
)
})
@@ -211,8 +285,8 @@ test_that("inner_join is symmetric (even when joining on character & factor)", {
foo <- data_frame(id = factor(c("a", "b")), var1 = "foo")
bar <- data_frame(id = c("a", "b"), var2 = "bar")
- expect_warning(tmp1 <- inner_join(foo, bar, by="id"), "joining factor and character")
- expect_warning(tmp2 <- inner_join(bar, foo, by="id"), "joining character vector and factor")
+ expect_warning(tmp1 <- inner_join(foo, bar, by = "id"), "joining factor and character")
+ expect_warning(tmp2 <- inner_join(bar, foo, by = "id"), "joining character vector and factor")
expect_is(tmp1$id, "character")
expect_is(tmp2$id, "character")
@@ -227,8 +301,8 @@ test_that("inner_join is symmetric, even when type of join var is different (#45
foo <- tbl_df(data.frame(id = 1:10, var1 = "foo"))
bar <- tbl_df(data.frame(id = as.numeric(rep(1:10, 5)), var2 = "bar"))
- tmp1 <- inner_join(foo, bar, by="id")
- tmp2 <- inner_join(bar, foo, by="id")
+ tmp1 <- inner_join(foo, bar, by = "id")
+ tmp2 <- inner_join(bar, foo, by = "id")
expect_equal(names(tmp1), c("id", "var1", "var2"))
expect_equal(names(tmp2), c("id", "var2", "var1"))
@@ -236,18 +310,18 @@ test_that("inner_join is symmetric, even when type of join var is different (#45
expect_equal(tmp1, tmp2)
})
-test_that("left_join by different variable names (#617)",{
+test_that("left_join by different variable names (#617)", {
x <- data_frame(x1 = c(1, 3, 2))
y <- data_frame(y1 = c(1, 2, 3), y2 = c("foo", "foo", "bar"))
res <- left_join(x, y, by = c("x1" = "y1"))
- expect_equal(names(res), c("x1", "y2" ) )
- expect_equal(res$x1, c(1,3,2))
+ expect_equal(names(res), c("x1", "y2"))
+ expect_equal(res$x1, c(1, 3, 2))
expect_equal(res$y2, c("foo", "bar", "foo"))
})
-test_that("joins support comple vectors" ,{
- a <- data.frame(x = c(1, 1, 2, 3)*1i, y = 1:4)
- b <- data.frame(x = c(1, 2, 2, 4)*1i, z = 1:4)
+test_that("joins support comple vectors", {
+ a <- data.frame(x = c(1, 1, 2, 3) * 1i, y = 1:4)
+ b <- data.frame(x = c(1, 2, 2, 4) * 1i, z = 1:4)
j <- inner_join(a, b, "x")
expect_equal(names(j), c("x", "y", "z"))
@@ -255,114 +329,140 @@ test_that("joins support comple vectors" ,{
expect_equal(j$z, c(1, 1, 2, 3))
})
-test_that("joins suffix variable names (#655)" ,{
- a <- data.frame(x=1:10,y=2:11)
- b <- data.frame(z=5:14,x=3:12) # x from this gets suffixed by .y
- res <- left_join(a,b,by=c('x'='z'))
- expect_equal(names(res), c("x", "y", "x.y" ) )
+test_that("joins suffix variable names (#655)", {
+ a <- data.frame(x = 1:10, y = 2:11)
+ b <- data.frame(z = 5:14, x = 3:12) # x from this gets suffixed by .y
+ res <- left_join(a, b, by = c("x" = "z"))
+ expect_equal(names(res), c("x", "y", "x.y"))
- a <- data.frame(x=1:10,z=2:11)
- b <- data.frame(z=5:14,x=3:12) # x from this gets suffixed by .y
- res <- left_join(a,b,by=c('x'='z'))
+ a <- data.frame(x = 1:10, z = 2:11)
+ b <- data.frame(z = 5:14, x = 3:12) # x from this gets suffixed by .y
+ res <- left_join(a, b, by = c("x" = "z"))
})
test_that("right_join gets the column in the right order #96", {
- a <- data.frame(x=1:10,y=2:11)
- b <- data.frame(x=5:14,z=3:12)
- res <- right_join(a,b)
+ a <- data.frame(x = 1:10, y = 2:11)
+ b <- data.frame(x = 5:14, z = 3:12)
+ res <- right_join(a, b)
expect_equal(names(res), c("x", "y", "z"))
- a <- data.frame(x=1:10,y=2:11)
- b <- data.frame(z=5:14,a=3:12)
- res <- right_join(a,b, by= c("x"="z"))
+ a <- data.frame(x = 1:10, y = 2:11)
+ b <- data.frame(z = 5:14, a = 3:12)
+ res <- right_join(a, b, by = c("x" = "z"))
expect_equal(names(res), c("x", "y", "a"))
})
-test_that("full_join #96",{
- a <- data.frame(x=1:3,y=2:4)
- b <- data.frame(x=3:5,z=3:5)
- res <- full_join(a,b, "x")
+test_that("full_join #96", {
+ a <- data.frame(x = 1:3, y = 2:4)
+ b <- data.frame(x = 3:5, z = 3:5)
+ res <- full_join(a, b, "x")
expect_equal(res$x, 1:5)
expect_equal(res$y[1:3], 2:4)
- expect_true( all(is.na(res$y[4:5]) ))
+ expect_true(all(is.na(res$y[4:5])))
- expect_true( all(is.na(res$z[1:2]) ))
- expect_equal( res$z[3:5], 3:5 )
+ expect_true(all(is.na(res$z[1:2])))
+ expect_equal(res$z[3:5], 3:5)
})
test_that("JoinStringFactorVisitor and JoinFactorStringVisitor handle NA #688", {
- x <- data.frame(Greek = c("Alpha", "Beta", NA), numbers= 1:3)
- y <- data.frame(Greek = c("Alpha", "Beta", "Gamma"),
- Letters = c("C", "B", "C"), stringsAsFactors = F)
-
- expect_warning(res <- left_join(x, y, by = "Greek"), "joining character vector")
- expect_true( is.na(res$Greek[3]) )
- expect_true( is.na(res$Letters[3]) )
- expect_equal( res$numbers, 1:3 )
+ x <- data.frame(Greek = c("Alpha", "Beta", NA), numbers = 1:3)
+ y <- data.frame(
+ Greek = c("Alpha", "Beta", "Gamma"),
+ Letters = c("C", "B", "C"),
+ stringsAsFactors = F
+ )
- expect_warning(res <- left_join(y, x, by="Greek" ), "joining factor")
- expect_equal( res$Greek, y$Greek)
- expect_equal( res$Letters, y$Letters )
- expect_equal( res$numbers[1:2], 1:2 )
- expect_true( is.na(res$numbers[3]) )
+ expect_warning(
+ res <- left_join(x, y, by = "Greek"),
+ "Column `Greek` joining factor and character vector, coercing into character vector",
+ fixed = TRUE
+ )
+ expect_true(is.na(res$Greek[3]))
+ expect_true(is.na(res$Letters[3]))
+ expect_equal(res$numbers, 1:3)
+
+ expect_warning(
+ res <- left_join(y, x, by = "Greek"),
+ "Column `Greek` joining character vector and factor, coercing into character vector",
+ fixed = TRUE
+ )
+ expect_equal(res$Greek, y$Greek)
+ expect_equal(res$Letters, y$Letters)
+ expect_equal(res$numbers[1:2], 1:2)
+ expect_true(is.na(res$numbers[3]))
})
-test_that("JoinFactorFactorVisitor_SameLevels preserve levels order (#675)",{
- input <- data.frame(g1 = factor(c('A','B','C'), levels = c('B','A','C')))
+test_that("JoinFactorFactorVisitor_SameLevels preserve levels order (#675)", {
+ input <- data.frame(g1 = factor(c("A", "B", "C"), levels = c("B", "A", "C")))
output <- data.frame(
- g1 = factor(c('A','B','C'), levels = c('B','A','C')),
- g2 = factor(c('A','B','C'), levels = c('B','A','C'))
+ g1 = factor(c("A", "B", "C"), levels = c("B", "A", "C")),
+ g2 = factor(c("A", "B", "C"), levels = c("B", "A", "C"))
)
res <- inner_join(group_by(input, g1), group_by(output, g1))
- expect_equal( levels(res$g1), levels(input$g1))
- expect_equal( levels(res$g2), levels(output$g2))
+ expect_equal(levels(res$g1), levels(input$g1))
+ expect_equal(levels(res$g2), levels(output$g2))
})
test_that("inner_join does not reorder (#684)", {
test <- data_frame(Greek = c("Alpha", "Beta", "Gamma"), Letters = LETTERS[1:3])
lookup <- data_frame(Letters = c("C", "B", "C"))
res <- inner_join(lookup, test)
- expect_equal( res$Letters, c("C", "B", "C" ) )
+ expect_equal(res$Letters, c("C", "B", "C"))
})
test_that("joins coerce factors with different levels to character (#684)", {
- d1 <- data_frame( a = factor( c("a", "b", "c" ) ) )
- d2 <- data_frame( a = factor( c("a", "e" ) ) )
- expect_warning( { res <- inner_join( d1, d2 ) })
- expect_is( res$a, "character" )
+ d1 <- data_frame(a = factor(c("a", "b", "c")))
+ d2 <- data_frame(a = factor(c("a", "e")))
+ expect_warning(res <- inner_join(d1, d2))
+ expect_is(res$a, "character")
# different orders
d2 <- d1
- attr( d2$a, "levels" ) <- c("c", "b", "a" )
- expect_warning( { res <- inner_join( d1, d2 ) })
- expect_is( res$a, "character" )
+ attr(d2$a, "levels") <- c("c", "b", "a")
+ expect_warning(res <- inner_join(d1, d2))
+ expect_is(res$a, "character")
})
test_that("joins between factor and character coerces to character with a warning (#684)", {
- d1 <- data_frame( a = factor( c("a", "b", "c" ) ) )
- d2 <- data_frame( a = c("a", "e" ) )
- expect_warning( { res <- inner_join( d1, d2 ) })
- expect_is( res$a, "character" )
+ d1 <- data_frame(a = factor(c("a", "b", "c")))
+ d2 <- data_frame(a = c("a", "e"))
+ expect_warning(res <- inner_join(d1, d2))
+ expect_is(res$a, "character")
- expect_warning( { res <- inner_join( d2, d1 ) })
- expect_is( res$a, "character" )
+ expect_warning(res <- inner_join(d2, d1))
+ expect_is(res$a, "character")
})
+test_that("group column names reflect renamed duplicate columns (#2330)", {
+ d1 <- data_frame(x = 1:5, y = 1:5) %>% group_by(x, y)
+ d2 <- data_frame(x = 1:5, y = 1:5)
+ res <- inner_join(d1, d2, by = "x")
+ expect_groups(d1, c("x", "y"))
+ expect_groups(res, c("x", "y.x"))
+})
+
+test_that("group column names are null when joined data frames are not grouped (#2330)", {
+ d1 <- data_frame(x = 1:5, y = 1:5)
+ d2 <- data_frame(x = 1:5, y = 1:5)
+ res <- inner_join(d1, d2, by = "x")
+ expect_no_groups(res)
+})
+
# Guessing variables in x and y ------------------------------------------------
test_that("unnamed vars are the same in both tables", {
- by1 <- common_by(c("x", "y", "z"))
+ by1 <- common_by_from_vector(c("x", "y", "z"))
expect_equal(by1$x, c("x", "y", "z"))
expect_equal(by1$y, c("x", "y", "z"))
- by2 <- common_by(c("x" = "a", "y", "z"))
+ by2 <- common_by_from_vector(c("x" = "a", "y", "z"))
expect_equal(by2$x, c("x", "y", "z"))
expect_equal(by2$y, c("a", "y", "z"))
})
@@ -376,41 +476,43 @@ test_that("join columns are not moved to the left (#802)", {
})
test_that("join can handle multiple encodings (#769)", {
- x <- data_frame(name=c("\xC9lise","Pierre","Fran\xE7ois"),score=c(5,7,6))
- y <- data_frame(name=c("\xC9lise","Pierre","Fran\xE7ois"),attendance=c(8,10,9))
+ text <- c("\xC9lise", "Pierre", "Fran\xE7ois")
+ Encoding(text) <- "latin1"
+ x <- data_frame(name = text, score = c(5, 7, 6))
+ y <- data_frame(name = text, attendance = c(8, 10, 9))
res <- left_join(x, y, by = "name")
- expect_equal( nrow(res), 3L)
- expect_equal( res$name, x$name)
+ expect_equal(nrow(res), 3L)
+ expect_equal(res$name, x$name)
- x <- data_frame(name=factor(c("\xC9lise","Pierre","Fran\xE7ois")),score=c(5,7,6))
- y <- data_frame(name=c("\xC9lise","Pierre","Fran\xE7ois"),attendance=c(8,10,9))
- res <- suppressWarnings( left_join(x, y, by = "name") )
- expect_equal( nrow(res), 3L)
- expect_equal( res$name, y$name)
+ x <- data_frame(name = factor(text), score = c(5, 7, 6))
+ y <- data_frame(name = text, attendance = c(8, 10, 9))
+ res <- suppressWarnings(left_join(x, y, by = "name"))
+ expect_equal(nrow(res), 3L)
+ expect_equal(res$name, y$name)
- x <- data_frame(name=c("\xC9lise","Pierre","Fran\xE7ois"),score=c(5,7,6))
- y <- data_frame(name=factor(c("\xC9lise","Pierre","Fran\xE7ois")),attendance=c(8,10,9))
- res <- suppressWarnings( left_join(x, y, by = "name") )
- expect_equal( nrow(res), 3L)
- expect_equal( res$name, x$name)
+ x <- data_frame(name = text, score = c(5, 7, 6))
+ y <- data_frame(name = factor(text), attendance = c(8, 10, 9))
+ res <- suppressWarnings(left_join(x, y, by = "name"))
+ expect_equal(nrow(res), 3L)
+ expect_equal(res$name, x$name)
- x <- data_frame(name=factor(c("\xC9lise","Fran\xE7ois","Pierre")),score=c(5,7,6))
- y <- data_frame(name=factor(c("\xC9lise","Pierre","Fran\xE7ois")),attendance=c(8,10,9))
- res <- suppressWarnings( left_join(x, y, by = "name") )
- expect_equal( nrow(res), 3L)
- expect_equal( res$name, x$name)
+ x <- data_frame(name = factor(text), score = c(5, 7, 6))
+ y <- data_frame(name = factor(text), attendance = c(8, 10, 9))
+ res <- suppressWarnings(left_join(x, y, by = "name"))
+ expect_equal(nrow(res), 3L)
+ expect_equal(res$name, x$name)
})
test_that("join creates correctly named results (#855)", {
- x <- data.frame(q=c("a","b","c"),r=c("d","e","f"),s=c("1","2","3"))
- y <- data.frame(q=c("a","b","c"),r=c("d","e","f"),t=c("xxx","xxx","xxx"))
- res <- left_join(x,y,by=c("r","q"))
- expect_equal(names(res), c("q", "r", "s", "t") )
+ x <- data.frame(q = c("a", "b", "c"), r = c("d", "e", "f"), s = c("1", "2", "3"))
+ y <- data.frame(q = c("a", "b", "c"), r = c("d", "e", "f"), t = c("xxx", "xxx", "xxx"))
+ res <- left_join(x, y, by = c("r", "q"))
+ expect_equal(names(res), c("q", "r", "s", "t"))
expect_equal(res$q, x$q)
expect_equal(res$r, x$r)
})
-test_that("inner join gives same result as merge. #1281", {
+test_that("inner join gives same result as merge by default (#1281)", {
set.seed(75)
x <- data.frame(cat1 = sample(c("A", "B", NA), 5, 1),
cat2 = sample(c(1, 2, NA), 5, 1), v = rpois(5, 3),
@@ -418,90 +520,120 @@ test_that("inner join gives same result as merge. #1281", {
y <- data.frame(cat1 = sample(c("A", "B", NA), 5, 1),
cat2 = sample(c(1, 2, NA), 5, 1), v = rpois(5, 3),
stringsAsFactors = FALSE)
- ij <- inner_join(x, y, by = c("cat1","cat2"))
- me <- merge(x, y, by = c("cat1","cat2"))
- expect_true( equal_data_frame(ij, me) )
+ ij <- inner_join(x, y, by = c("cat1", "cat2"))
+ me <- merge(x, y, by = c("cat1", "cat2"))
+ expect_true(equal_data_frame(ij, me))
})
test_that("join handles matrices #1230", {
- df1 <- data_frame( x = 1:10, text = letters[1:10])
- df2 <- data_frame( x = 1:5, text = "" )
- df2$text <- matrix( LETTERS[1:10], nrow=5)
+ df1 <- data_frame(x = 1:10, text = letters[1:10])
+ df2 <- data_frame(x = 1:5, text = "")
+ df2$text <- matrix(LETTERS[1:10], nrow = 5)
- res <- left_join( df1, df2, by = c("x"="x")) %>% filter( x > 5)
+ res <- left_join(df1, df2, by = c("x" = "x")) %>% filter(x > 5)
text.y <- res$text.y
- expect_true( is.matrix(text.y) )
- expect_equal( dim(text.y), c(5,2) )
- expect_true( all(is.na(text.y)) )
+ expect_true(is.matrix(text.y))
+ expect_equal(dim(text.y), c(5, 2))
+ expect_true(all(is.na(text.y)))
})
-test_that( "ordering of strings is not confused by R's collate order (#1315)", {
- a= data.frame(character = c("\u0663"),set=c("arabic_the_language"),stringsAsFactors=F)
- b = data.frame(character = c("3"),set=c("arabic_the_numeral_set"),stringsAsFactors = F)
- res <- b %>% inner_join(a,by=c("character"))
- expect_equal( nrow(res), 0L)
- res <- a %>% inner_join(b,by=c("character"))
- expect_equal( nrow(res), 0L)
+test_that("ordering of strings is not confused by R's collate order (#1315)", {
+ a = data.frame(character = c("\u0663"), set = c("arabic_the_language"), stringsAsFactors = F)
+ b = data.frame(character = c("3"), set = c("arabic_the_numeral_set"), stringsAsFactors = F)
+ res <- b %>% inner_join(a, by = c("character"))
+ expect_equal(nrow(res), 0L)
+ res <- a %>% inner_join(b, by = c("character"))
+ expect_equal(nrow(res), 0L)
})
test_that("joins handle tzone differences (#819)", {
date1 <- structure(-1735660800, tzone = "America/Chicago", class = c("POSIXct", "POSIXt"))
date2 <- structure(-1735660800, tzone = "UTC", class = c("POSIXct", "POSIXt"))
- df1 <- data.frame( date = date1 )
- df2 <- data.frame( date = date2 )
+ df1 <- data.frame(date = date1)
+ df2 <- data.frame(date = date2)
- expect_equal( attr(left_join(df1, df1)$date, "tzone"), "America/Chicago" )
+ expect_equal(attr(left_join(df1, df1)$date, "tzone"), "America/Chicago")
})
-test_that("joins matches NA in character vector (#892)", {
- x <- data.frame(id = c(NA_character_, NA_character_),
- stringsAsFactors = F)
+test_that("joins matches NA in character vector by default (#892, #2033)", {
+ x <- data.frame(
+ id = c(NA_character_, NA_character_),
+ stringsAsFactors = F
+ )
- y <- expand.grid(id = c(NA_character_, NA_character_),
- LETTER = LETTERS[1:2],
- stringsAsFactors = F)
+ y <- expand.grid(
+ id = c(NA_character_, NA_character_),
+ LETTER = LETTERS[1:2],
+ stringsAsFactors = F
+ )
- res <- left_join(x, y, by = 'id')
- expect_true( all( is.na(res$id)) )
- expect_equal( res$LETTER, rep(rep(c("A", "B"), each = 2), 2) )
+ res <- left_join(x, y, by = "id")
+ expect_true(all(is.na(res$id)))
+ expect_equal(res$LETTER, rep(rep(c("A", "B"), each = 2), 2))
})
-test_that( "joins avoid name repetition (#1460)", {
- d1 <- data.frame(id=1:5, foo=rnorm(5))
- d2 <- data.frame(id=1:5, foo=rnorm(5))
- d3 <- data.frame(id=1:5, foo=rnorm(5))
- d <- d1 %>% left_join(d1, by="id") %>%
- left_join(d2, by="id") %>%
- left_join(d3, by="id")
- expect_equal( names(d), c("id", "foo.x", "foo.y", "foo.x.x", "foo.y.y"))
+test_that("joins avoid name repetition (#1460)", {
+ d1 <- data.frame(id = 1:5, foo = rnorm(5))
+ d2 <- data.frame(id = 1:5, foo = rnorm(5))
+ d3 <- data.frame(id = 1:5, foo = rnorm(5))
+ d <- d1 %>% left_join(d1, by = "id") %>%
+ left_join(d2, by = "id") %>%
+ left_join(d3, by = "id")
+ expect_equal(names(d), c("id", "foo.x", "foo.y", "foo.x.x", "foo.y.y"))
})
test_that("join functions are protected against empty by (#1496)", {
x <- data.frame()
- y <- data.frame(a=1)
- expect_error( left_join(x,y, by = names(x) ), "no variable to join by" )
- expect_error( right_join(x,y, by = names(x) ), "no variable to join by" )
- expect_error( semi_join(x,y, by = names(x) ), "no variable to join by" )
- expect_error( full_join(x,y, by = names(x) ), "no variable to join by" )
- expect_error( anti_join(x,y, by = names(x) ), "no variable to join by" )
- expect_error( inner_join(x,y, by = names(x) ), "no variable to join by" )
+ y <- data.frame(a = 1)
+ expect_error(
+ left_join(x, y, by = names(x)),
+ "`by` must specify variables to join by",
+ fixed = TRUE
+ )
+ expect_error(
+ right_join(x, y, by = names(x)),
+ "`by` must specify variables to join by",
+ fixed = TRUE
+ )
+ expect_error(
+ semi_join(x, y, by = names(x)),
+ "`by` must specify variables to join by",
+ fixed = TRUE
+ )
+ expect_error(
+ full_join(x, y, by = names(x)),
+ "`by` must specify variables to join by",
+ fixed = TRUE
+ )
+ expect_error(
+ anti_join(x, y, by = names(x)),
+ "`by` must specify variables to join by",
+ fixed = TRUE
+ )
+ expect_error(
+ inner_join(x, y, by = names(x)),
+ "`by` must specify variables to join by",
+ fixed = TRUE
+ )
})
test_that("joins takes care of duplicates in by (#1192)", {
- data2 <- data_frame(a=1:3)
- data1 <- data_frame(a=1:3, c=3:5)
+ data2 <- data_frame(a = 1:3)
+ data1 <- data_frame(a = 1:3, c = 3:5)
- res1 <- left_join(data1, data2, by=c("a","a"))
- res2 <- left_join(data1, data2, by=c("a" = "a"))
+ res1 <- left_join(data1, data2, by = c("a", "a"))
+ res2 <- left_join(data1, data2, by = c("a" = "a"))
expect_equal(res1, res2)
})
# Joined columns result in correct type ----------------------------------------
test_that("result of joining POSIXct is POSIXct (#1578)", {
- data1 <- data_frame(t=seq(as.POSIXct("2015-12-01", tz="UTC"), length.out=2,
- by="days"), x=1:2)
+ data1 <- data_frame(
+ t = seq(as.POSIXct("2015-12-01", tz = "UTC"), length.out = 2, by = "days"),
+ x = 1:2
+ )
data2 <- inner_join(data1, data1, by = "t")
res1 <- class(data2$t)
expected <- c("POSIXct", "POSIXt")
@@ -511,52 +643,60 @@ test_that("result of joining POSIXct is POSIXct (#1578)", {
test_that("joins allows extra attributes if they are identical (#1636)", {
tbl_left <- data_frame(
- i = rep(c(1.0, 2.0, 3.0), each = 2),
+ i = rep(c(1, 2, 3), each = 2),
x1 = letters[1:6]
)
tbl_right <- data_frame(
- i = c(1.0, 2.0, 3.0),
+ i = c(1, 2, 3),
x2 = letters[1:3]
)
- attr(tbl_left$i, 'label') <- 'iterator'
- attr(tbl_right$i, 'label') <- 'iterator'
+ attr(tbl_left$i, "label") <- "iterator"
+ attr(tbl_right$i, "label") <- "iterator"
- res <- left_join(tbl_left, tbl_right, by = 'i')
- expect_equal( attr(res$i, "label"), "iterator" )
+ res <- left_join(tbl_left, tbl_right, by = "i")
+ expect_equal(attr(res$i, "label"), "iterator")
- attr(tbl_left$i, "foo" ) <- "bar"
+ attr(tbl_left$i, "foo") <- "bar"
attributes(tbl_right$i) <- NULL
- attr(tbl_right$i, "foo" ) <- "bar"
- attr(tbl_right$i, 'label') <- 'iterator'
+ attr(tbl_right$i, "foo") <- "bar"
+ attr(tbl_right$i, "label") <- "iterator"
- res <- left_join(tbl_left, tbl_right, by = 'i')
- expect_equal( attr(res$i, "label"), "iterator" )
- expect_equal( attr(res$i, "foo"), "bar" )
+ res <- left_join(tbl_left, tbl_right, by = "i")
+ expect_equal(attr(res$i, "label"), "iterator")
+ expect_equal(attr(res$i, "foo"), "bar")
})
test_that("joins work with factors of different levels (#1712)", {
d1 <- iris[, c("Species", "Sepal.Length")]
d2 <- iris[, c("Species", "Sepal.Width")]
- d2$Species <- factor(as.character(d2$Species), levels=rev(levels(d1$Species)))
- expect_warning( res1 <- left_join(d1, d2, by="Species") )
+ d2$Species <- factor(as.character(d2$Species), levels = rev(levels(d1$Species)))
+ expect_warning(res1 <- left_join(d1, d2, by = "Species"))
d1$Species <- as.character(d1$Species)
d2$Species <- as.character(d2$Species)
- res2 <- left_join(d1, d2, by="Species")
+ res2 <- left_join(d1, d2, by = "Species")
expect_equal(res1, res2)
})
test_that("anti and semi joins give correct result when by variable is a factor (#1571)", {
- big <- data.frame(letter = rep(c('a', 'b'), each = 2), number = 1:2)
- small <- data.frame(letter = 'b')
- expect_warning( aj_result <- anti_join(big, small, by = "letter"), NA )
- expect_equal( aj_result$number, 1:2)
- expect_equal( aj_result$letter, factor(c("a", "a"), levels = c("a", "b")) )
+ big <- data.frame(letter = rep(c("a", "b"), each = 2), number = 1:2)
+ small <- data.frame(letter = "b")
+ expect_warning(
+ aj_result <- anti_join(big, small, by = "letter"),
+ "Column `letter` joining factors with different levels, coercing to character vector",
+ fixed = TRUE
+ )
+ expect_equal(aj_result$number, 1:2)
+ expect_equal(aj_result$letter, factor(c("a", "a"), levels = c("a", "b")))
- expect_warning( sj_result <- semi_join(big, small, by = "letter"), NA )
- expect_equal( sj_result$number, 1:2)
- expect_equal( sj_result$letter, factor(c("b", "b"), levels = c("a", "b")) )
+ expect_warning(
+ sj_result <- semi_join(big, small, by = "letter"),
+ "Column `letter` joining factors with different levels, coercing to character vector",
+ fixed = TRUE
+ )
+ expect_equal(sj_result$number, 1:2)
+ expect_equal(sj_result$letter, factor(c("b", "b"), levels = c("a", "b")))
})
test_that("inner join not crashing (#1559)", {
@@ -577,23 +717,184 @@ test_that("inner join not crashing (#1559)", {
perc = c(0.15363485835208, -0.0318297270618471, 0.0466114830816894, 0.0971986553754823)
)
# all we want here is to test that this does not crash
- expect_message( res <- replicate(100, df3 %>% inner_join(df4)) )
- for( i in 2:100)
- expect_equal( res[,1], res[,i] )
+ expect_message(res <- replicate(100, df3 %>% inner_join(df4)))
+ for (i in 2:100) expect_equal(res[, 1], res[, i])
+})
+
+test_that("join handles mix of encodings in data (#1885, #2118, #2271)", {
+ with_non_utf8_encoding({
+ special <- get_native_lang_string()
+
+ for (factor1 in c(FALSE, TRUE)) {
+ for (factor2 in c(FALSE, TRUE)) {
+ for (encoder1 in c(enc2native, enc2utf8)) {
+ for (encoder2 in c(enc2native, enc2utf8)) {
+ df1 <- data.frame(x = encoder1(special), y = 1, stringsAsFactors = factor1)
+ df1 <- tbl_df(df1)
+ df2 <- data.frame(x = encoder2(special), z = 2, stringsAsFactors = factor2)
+ df2 <- tbl_df(df2)
+ df <- data.frame(x = special, y = 1, z = 2, stringsAsFactors = factor1 && factor2)
+ df <- tbl_df(df)
+
+ info <- paste(
+ factor1,
+ factor2,
+ Encoding(as.character(df1$x)),
+ Encoding(as.character(df2$x))
+ )
+
+ if (factor1 != factor2) warning_msg <- "coercing"
+ else warning_msg <- NA
+
+ expect_warning_msg <- function(code, msg = warning_msg) {
+ expect_warning(
+ code, msg,
+ info = paste(deparse(substitute(code)[[2]][[1]]), info))
+ }
+
+ expect_equal_df <- function(code, df_ = df) {
+ code <- substitute(code)
+ eval(bquote(
+ expect_equal(
+ .(code), df_,
+ info = paste(deparse(code[[1]]), info)
+ )
+ ))
+ }
+
+ expect_warning_msg(expect_equal_df(inner_join(df1, df2, by = "x")))
+ expect_warning_msg(expect_equal_df(left_join(df1, df2, by = "x")))
+ expect_warning_msg(expect_equal_df(right_join(df1, df2, by = "x")))
+ expect_warning_msg(expect_equal_df(full_join(df1, df2, by = "x")))
+ expect_warning_msg(
+ expect_equal_df(
+ semi_join(df1, df2, by = "x"),
+ data.frame(x = special, y = 1, stringsAsFactors = factor1)
+ )
+ )
+ expect_warning_msg(
+ expect_equal_df(
+ anti_join(df1, df2, by = "x"),
+ data.frame(x = special, y = 1, stringsAsFactors = factor1)[0,]
+ )
+ )
+ }
+ }
+ }
+ }
+ })
+})
+
+test_that("left_join handles mix of encodings in column names (#1571)", {
+ with_non_utf8_encoding({
+ special <- get_native_lang_string()
+
+ df1 <- data_frame(x = 1:6, foo = 1:6)
+ names(df1)[1] <- special
+
+ df2 <- data_frame(x = 1:6, baz = 1:6)
+ names(df2)[1] <- enc2native(special)
+
+ expect_message(res <- left_join(df1, df2), special, fixed = TRUE)
+ expect_equal(names(res), c(special, "foo", "baz"))
+ expect_equal(res$foo, 1:6)
+ expect_equal(res$baz, 1:6)
+ expect_equal(res[[special]], 1:6)
+ })
+})
+
+test_that("NAs match in joins only with na_matches = 'na' (#2033)", {
+ df1 <- data_frame(a = NA)
+ df2 <- data_frame(a = NA, b = 1:3)
+ for (na_matches in c("na", "never")) {
+ accept_na_match <- (na_matches == "na")
+ expect_equal(inner_join(df1, df2, na_matches = na_matches) %>% nrow, 0 + 3 * accept_na_match)
+ expect_equal(left_join(df1, df2, na_matches = na_matches) %>% nrow, 1 + 2 * accept_na_match)
+ expect_equal(right_join(df2, df1, na_matches = na_matches) %>% nrow, 1 + 2 * accept_na_match)
+ expect_equal(full_join(df1, df2, na_matches = na_matches) %>% nrow, 4 - accept_na_match)
+ expect_equal(anti_join(df1, df2, na_matches = na_matches) %>% nrow, 1 - accept_na_match)
+ expect_equal(semi_join(df1, df2, na_matches = na_matches) %>% nrow, 0 + accept_na_match)
+ }
+})
+
+test_that("joins strip group indexes (#1597)", {
+ df1 <- data_frame(a = 1:3) %>% group_by(a)
+ df2 <- data_frame(a = rep(1:4, 2)) %>% group_by(a)
+
+ expect_stripped <- function(df) {
+ expect_null(attr(df, "indices"))
+ expect_null(attr(df, "group_sizes"))
+ expect_null(attr(df, "biggest_group_size"))
+ expect_null(attr(df, "labels"))
+ }
+
+ expect_stripped(inner_join(df1, df2))
+ expect_stripped(left_join(df1, df2))
+ expect_stripped(right_join(df2, df1))
+ expect_stripped(full_join(df1, df2))
+ expect_stripped(anti_join(df1, df2))
+ expect_stripped(semi_join(df1, df2))
+})
+
+
+test_that("join accepts tz attributes (#2643)", {
+ # It's the same time:
+ df1 <- data_frame(a = as.POSIXct("2009-01-01 10:00:00", tz = "Europe/London"))
+ df2 <- data_frame(a = as.POSIXct("2009-01-01 11:00:00", tz = "Europe/Paris"))
+ result <- inner_join(df1, df2, by = "a")
+ expect_equal(nrow(result), 1)
+})
+
+test_that("join takes LHS with warning if attributes inconsistent", {
+ df1 <- tibble(a = 1:2, b = 2:1)
+ df2 <- tibble(
+ a = structure(1:2, foo = "bar"),
+ c = 2:1
+ )
+
+ expect_warning(
+ out1 <- left_join(df1, df2, by = "a"),
+ "Column `a` has different attributes on LHS and RHS of join"
+ )
+ expect_warning(out2 <- left_join(df2, df1, by = "a"))
+ expect_warning(
+ out3 <- left_join(df1, df2, by = c("b" = "a")),
+ "Column `b`/`a` has different attributes on LHS and RHS of join"
+ )
+
+ expect_equal(attr(out1$a, "foo"), NULL)
+ expect_equal(attr(out2$a, "foo"), "bar")
})
-test_that( "left_join handles mix of encodings in column names (#1571)", {
+test_that("common_by() message", {
+ df <- tibble(!!! set_names(letters, letters))
- df1 <- tibble::data_frame(x = 1:6, foo = 1:6)
- names(df1)[1] <- "l\u00f8penummer"
+ expect_message(
+ left_join(df, df %>% select(1)),
+ 'Joining, by = "a"',
+ fixed = TRUE
+ )
- df2 <- tibble::data_frame(x = 1:6, baz = 1:6)
- names(df2)[1] <- iconv( "l\u00f8penummer", from = "UTF-8", to = "latin1" )
+ expect_message(
+ left_join(df, df %>% select(1:3)),
+ 'Joining, by = c("a", "b", "c")',
+ fixed = TRUE
+ )
- expect_message( res <- left_join( df1, df2 ) )
- expect_equal( names(res), c("l\u00f8penummer", "foo", "baz") )
- expect_equal( res$foo, 1:6)
- expect_equal( res$baz, 1:6)
- expect_equal( res[["l\u00f8penummer"]], 1:6)
+ expect_message(
+ left_join(df, df),
+ paste0("Joining, by = c(", paste0('"', letters, '"', collapse = ", "), ")"),
+ fixed = TRUE
+ )
+})
+test_that("semi- and anti-joins preserve order (#2964)", {
+ expect_identical(
+ data_frame(a = 3:1) %>% semi_join(data_frame(a = 1:3)),
+ data_frame(a = 3:1)
+ )
+ expect_identical(
+ data_frame(a = 3:1) %>% anti_join(data_frame(a = 4:6)),
+ data_frame(a = 3:1)
+ )
})
diff --git a/tests/testthat/test-lazy-ops.R b/tests/testthat/test-lazy-ops.R
deleted file mode 100644
index 9c19e07..0000000
--- a/tests/testthat/test-lazy-ops.R
+++ /dev/null
@@ -1,110 +0,0 @@
-context("lazy-ops")
-
-# op_vars -----------------------------------------------------------------
-
-test_that("select reduces variables", {
- out <- mtcars %>% tbl_lazy() %>% select(mpg:disp)
- expect_equal(op_vars(out), c("mpg", "cyl", "disp"))
-})
-
-test_that("rename preserves existing", {
- out <- data_frame(x = 1, y = 2) %>% tbl_lazy() %>% rename(z = y)
- expect_equal(op_vars(out), c("x", "z"))
-})
-
-test_that("mutate adds new", {
- out <- data_frame(x = 1) %>% tbl_lazy() %>% mutate(y = x + 1, z = y + 1)
- expect_equal(op_vars(out), c("x", "y", "z"))
-})
-
-test_that("summarise replaces existing", {
- out <- data_frame(x = 1, y = 2) %>% tbl_lazy() %>% summarise(z = 1)
- expect_equal(op_vars(out), "z")
-})
-
-test_that("grouped summary keeps groups", {
- out <- data_frame(g = 1, x = 1) %>%
- tbl_lazy() %>%
- group_by(g) %>%
- summarise(y = 1)
- expect_equal(op_vars(out), c("g", "y"))
-})
-
-test_that("joins get vars from both left and right", {
- out <- left_join(
- lazy_frame(x = 1, y = 1),
- lazy_frame(x = 2, z = 2),
- by = "x"
- )
-
- expect_equal(op_vars(out), c("x", "y", "z"))
-})
-
-test_that("semi joins get vars from left", {
- out <- semi_join(
- lazy_frame(x = 1, y = 1),
- lazy_frame(x = 2, z = 2),
- by = "x"
- )
-
- expect_equal(op_vars(out), c("x", "y"))
-})
-
-
-# op_grps -----------------------------------------------------------------
-
-test_that("group_by overrides existing groups", {
- df <- data_frame(g1 = 1, g2 = 2, x = 3) %>% tbl_lazy()
-
- out1 <- df %>% group_by(g1)
- expect_equal(op_grps(out1), "g1")
-
- out2 <- out1 %>% group_by(g2)
- expect_equal(op_grps(out2), "g2")
-})
-
-test_that("group_by increases grouping if add = TRUE", {
- df <- data_frame(g1 = 1, g2 = 2, x = 3) %>% tbl_lazy()
-
- out <- df %>% group_by(g1) %>% group_by(g2, add = TRUE)
- expect_equal(op_grps(out), c("g1", "g2"))
-})
-
-test_that("summarise drops one grouping level", {
- df <- data_frame(g1 = 1, g2 = 2, x = 3) %>% tbl_lazy() %>% group_by(g1, g2)
- out1 <- df %>% summarise(y = 1)
- out2 <- out1 %>% summarise(y = 2)
-
- expect_equal(op_grps(out1), "g1")
- expect_equal(op_grps(out2), NULL)
-})
-
-test_that("ungroup drops all groups", {
- out <- lazy_frame(g1 = 1, g2 = 2) %>%
- group_by(g1, g2) %>%
- ungroup()
-
- expect_equal(op_grps(out), NULL)
-})
-
-# op_sort -----------------------------------------------------------------
-
-test_that("unsorted gives NULL", {
- out <- lazy_frame(x = 1:3, y = 3:1)
- expect_equal(op_sort(out), NULL)
-})
-
-test_that("arranges captures DESC", {
- out <- lazy_frame(x = 1:3, y = 3:1) %>%
- arrange(desc(x))
-
- expect_equal(op_sort(out), sql('"x" DESC'))
-})
-
-test_that("multiple arranges combine", {
- out <- lazy_frame(x = 1:3, y = 3:1) %>%
- arrange(x) %>%
- arrange(y)
-
- expect_equal(op_sort(out), sql('"x"', '"y"'))
-})
diff --git a/tests/testthat/test-lazyeval-compat.R b/tests/testthat/test-lazyeval-compat.R
new file mode 100644
index 0000000..a094c96
--- /dev/null
+++ b/tests/testthat/test-lazyeval-compat.R
@@ -0,0 +1,27 @@
+context("lazyeval compatibility")
+
+test_that("can select negatively (#2519)", {
+ expect_identical(select_(mtcars, ~-cyl), mtcars[-2])
+})
+
+test_that("select yields proper names", {
+ expect_identical(names(select_(mtcars, ~cyl:hp)), c("cyl", "disp", "hp"))
+})
+
+test_that("lazydots are named and arrange() doesn't fail (it assumes empty names)", {
+ dots <- compat_lazy_dots(list(), env(), "cyl")
+ expect_identical(names(dots), "")
+ expect_identical(arrange_(mtcars, "cyl"), arrange(mtcars, cyl))
+})
+
+test_that("mutate_each_() and summarise_each_() handle lazydots", {
+ cyl_chr <- mutate_each_(mtcars, funs(as.character), "cyl")$cyl
+ expect_identical(cyl_chr, as.character(mtcars$cyl))
+
+ cyl_mean <- summarise_each_(mtcars, funs(mean), "cyl")$cyl
+ expect_equal(cyl_mean, mean(mtcars$cyl))
+})
+
+test_that("select_vars_() handles lazydots", {
+ expect_identical(select_vars_(letters, c("a", "b")), set_names(c("a", "b")))
+})
diff --git a/tests/testthat/test-lead-lag.R b/tests/testthat/test-lead-lag.R
index 069d29e..2e450a1 100644
--- a/tests/testthat/test-lead-lag.R
+++ b/tests/testthat/test-lead-lag.R
@@ -19,24 +19,62 @@ test_that("lead and lag preserves dates and times", {
})
test_that("#925 is fixed", {
- data <- data_frame(
- name = c("Rob", "Pete", "Rob", "John", "Rob", "Pete", "John", "Pete", "John", "Pete", "Rob", "Rob"),
+ data <- data_frame(
+ name = c("Rob", "Pete", "Rob", "John", "Rob", "Pete", "John", "Pete", "John", "Pete", "Rob", "Rob"),
time = c(3, 2, 5, 3, 2, 3, 2, 4, 1, 1, 4, 1)
)
- res <- data %>% group_by(name) %>% mutate( lag_time = lag(time) )
- expect_equal( res$lag_time[ res$name == "Rob" ] , c(NA, head( data$time[data$name == "Rob"] , -1 ) ) )
- expect_equal( res$lag_time[ res$name == "Pete" ], c(NA, head( data$time[data$name == "Pete"], -1 ) ) )
- expect_equal( res$lag_time[ res$name == "John" ], c(NA, head( data$time[data$name == "John"], -1 ) ) )
+ res <- data %>% group_by(name) %>% mutate(lag_time = lag(time))
+ expect_equal(
+ res$lag_time[res$name == "Rob"],
+ c(NA, head(data$time[data$name == "Rob"], -1))
+ )
+ expect_equal(
+ res$lag_time[res$name == "Pete"],
+ c(NA, head(data$time[data$name == "Pete"], -1))
+ )
+ expect_equal(
+ res$lag_time[res$name == "John"],
+ c(NA, head(data$time[data$name == "John"], -1))
+ )
})
test_that("#937 is fixed", {
df <- data_frame(
- name=rep(c('Al','Jen'),3),
- score=rep(c(100, 80, 60),2)
+ name = rep(c("Al", "Jen"), 3),
+ score = rep(c(100, 80, 60), 2)
)
- res <- df %>% group_by(name) %>% mutate( next.score = lead(score) )
- expect_equal( res$next.score[ res$name == "Al" ] , c(tail( df$score[df$name == "Al"] , -1 ), NA ) )
- expect_equal( res$next.score[ res$name == "Jen" ] , c(tail( df$score[df$name == "Jen"] , -1 ), NA ) )
+ res <- df %>% group_by(name) %>% mutate(next.score = lead(score))
+ expect_equal(
+ res$next.score[res$name == "Al"],
+ c(tail(df$score[df$name == "Al"], -1), NA)
+ )
+ expect_equal(
+ res$next.score[res$name == "Jen"],
+ c(tail(df$score[df$name == "Jen"], -1), NA)
+ )
})
+test_that("input checks", {
+ expect_error(
+ lead(letters, -1),
+ "`n` must be a nonnegative integer scalar, not double of length 1",
+ fixed = TRUE
+ )
+ expect_error(
+ lead(letters, "1"),
+ "`n` must be a nonnegative integer scalar, not string of length 1",
+ fixed = TRUE
+ )
+
+ expect_error(
+ lag(letters, -1),
+ "`n` must be a nonnegative integer scalar, not double of length 1",
+ fixed = TRUE
+ )
+ expect_error(
+ lag(letters, "1"),
+ "`n` must be a nonnegative integer scalar, not string of length 1",
+ fixed = TRUE
+ )
+})
diff --git a/tests/testthat/test-mutate-windowed.R b/tests/testthat/test-mutate-windowed.R
index a3b4ad2..8c1e301 100644
--- a/tests/testthat/test-mutate-windowed.R
+++ b/tests/testthat/test-mutate-windowed.R
@@ -1,170 +1,127 @@
context("Mutate - windowed")
-test_that("mutate calls windowed versions of sql functions", {
- test_f <- function(tbl) {
- res <- tbl %>%
- group_by(g) %>%
- mutate(r = as.numeric(row_number(x))) %>%
- collect()
- expect_equal(res$r, c(1, 2, 1, 2))
- }
-
- df <- data_frame(x = 1:4, g = rep(c(1, 2), each = 2))
- tbls <- test_load(df, ignore = "sqlite") # SQLite doesn't support window functions
- tbls %>% lapply(test_f)
-})
-
-test_that("recycled aggregates generate window function", {
- test_f <- function(tbl) {
- res <- tbl %>%
- group_by(g) %>%
- mutate(r = x > mean(x)) %>%
- collect()
- expect_equal(res$r, c(FALSE, TRUE, FALSE, TRUE))
- }
-
- df <- data_frame(x = 1:4, g = rep(c(1, 2), each = 2))
- tbls <- test_load(df, ignore = "sqlite") # SQLite doesn't support window functions
- tbls %>% lapply(test_f)
-})
-
-test_that("cumulative aggregates generate window function", {
- test_f <- function(tbl) {
- res <- tbl %>%
- group_by(g) %>%
- arrange(x) %>%
- mutate(r = cumsum(x)) %>%
- collect()
- expect_equal(res$r, c(1, 3, 3, 7))
- }
-
- df <- data_frame(x = 1:4, g = rep(c(1, 2), each = 2))
- tbls <- test_load(df, ignore = "sqlite") # SQLite doesn't support window functions
- tbls %>% lapply(test_f)
-})
-
test_that("desc is correctly handled by window functions", {
- df <- data.frame(x = 1:10, y = seq(1,10,by=1), g = rep(c(1, 2), each = 5))
+ df <- data.frame(x = 1:10, y = seq(1, 10, by = 1), g = rep(c(1, 2), each = 5))
- expect_equal(mutate(df, rank=min_rank(desc(x)) )$rank, 10:1 )
- expect_equal(mutate(group_by(df,g), rank=min_rank(desc(x)))$rank, rep(5:1,2) )
+ expect_equal(mutate(df, rank = min_rank(desc(x)))$rank, 10:1)
+ expect_equal(mutate(group_by(df, g), rank = min_rank(desc(x)))$rank, rep(5:1, 2))
- expect_equal(mutate(df, rank=row_number(desc(x)) )$rank, 10:1 )
- expect_equal(mutate(group_by(df,g), rank=row_number(desc(x)))$rank, rep(5:1,2) )
+ expect_equal(mutate(df, rank = row_number(desc(x)))$rank, 10:1)
+ expect_equal(mutate(group_by(df, g), rank = row_number(desc(x)))$rank, rep(5:1, 2))
})
-test_that("row_number gives correct results",{
- tmp <- data.frame(id=rep(c(1,2),each=5),value=c(1,1,2,5,0,6,4,0,0,2))
- res <- group_by(tmp, id) %>% mutate(var=row_number(value))
- expect_equal(res$var, c(2,3,4,5,1,5,4,1,2,3))
+test_that("row_number gives correct results", {
+ tmp <- data.frame(id = rep(c(1, 2), each = 5), value = c(1, 1, 2, 5, 0, 6, 4, 0, 0, 2))
+ res <- group_by(tmp, id) %>% mutate(var = row_number(value))
+ expect_equal(res$var, c(2, 3, 4, 5, 1, 5, 4, 1, 2, 3))
})
test_that("row_number works with 0 arguments", {
g <- group_by(mtcars, cyl)
- expect_equal( mutate( g, rn = row_number() ), mutate( g, rn = 1:n() ) )
+ expect_equal(mutate(g, rn = row_number()), mutate(g, rn = 1:n()))
})
test_that("cum(sum,min,max) works", {
- df <- data.frame(x = 1:10, y = seq(1,10,by=1), g = rep(c(1, 2), each = 5))
+ df <- data.frame(x = 1:10, y = seq(1, 10, by = 1), g = rep(c(1, 2), each = 5))
- res <- mutate( df,
+ res <- mutate(df,
csumx = cumsum(x), csumy = cumsum(y),
cminx = cummin(x), cminy = cummin(y),
cmaxx = cummax(x), cmaxy = cummax(y)
)
- expect_equal( res$csumx, cumsum(df$x) )
- expect_equal( res$csumy, cumsum(df$y) )
- expect_equal( res$cminx, cummin(df$x) )
- expect_equal( res$cminy, cummin(df$y) )
- expect_equal( res$cmaxx, cummax(df$x) )
- expect_equal( res$cmaxy, cummax(df$y) )
-
- res <- mutate( group_by(df,g) ,
+ expect_equal(res$csumx, cumsum(df$x))
+ expect_equal(res$csumy, cumsum(df$y))
+ expect_equal(res$cminx, cummin(df$x))
+ expect_equal(res$cminy, cummin(df$y))
+ expect_equal(res$cmaxx, cummax(df$x))
+ expect_equal(res$cmaxy, cummax(df$y))
+
+ res <- mutate(group_by(df, g),
csumx = cumsum(x), csumy = cumsum(y),
cminx = cummin(x), cminy = cummin(y),
cmaxx = cummax(x), cmaxy = cummax(y)
)
- expect_equal( res$csumx, c( cumsum(df$x[1:5]), cumsum(df$x[6:10]) ) )
- expect_equal( res$csumy, c( cumsum(df$y[1:5]), cumsum(df$y[6:10]) ) )
- expect_equal( res$cminx, c( cummin(df$x[1:5]), cummin(df$x[6:10]) ) )
- expect_equal( res$cminy, c( cummin(df$y[1:5]), cummin(df$y[6:10]) ) )
- expect_equal( res$cmaxx, c( cummax(df$x[1:5]), cummax(df$x[6:10]) ) )
- expect_equal( res$cmaxy, c( cummax(df$y[1:5]), cummax(df$y[6:10]) ) )
+ expect_equal(res$csumx, c(cumsum(df$x[1:5]), cumsum(df$x[6:10])))
+ expect_equal(res$csumy, c(cumsum(df$y[1:5]), cumsum(df$y[6:10])))
+ expect_equal(res$cminx, c(cummin(df$x[1:5]), cummin(df$x[6:10])))
+ expect_equal(res$cminy, c(cummin(df$y[1:5]), cummin(df$y[6:10])))
+ expect_equal(res$cmaxx, c(cummax(df$x[1:5]), cummax(df$x[6:10])))
+ expect_equal(res$cmaxy, c(cummax(df$y[1:5]), cummax(df$y[6:10])))
df$x[3] <- NA
df$y[4] <- NA
- res <- mutate( df,
+ res <- mutate(df,
csumx = cumsum(x), csumy = cumsum(y),
cminx = cummin(x), cminy = cummin(y),
cmaxx = cummax(x), cmaxy = cummax(y)
)
- expect_true( all(is.na(res$csumx[3:10])) )
- expect_true( all(is.na(res$csumy[4:10])) )
+ expect_true(all(is.na(res$csumx[3:10])))
+ expect_true(all(is.na(res$csumy[4:10])))
- expect_true( all(is.na(res$cminx[3:10])) )
- expect_true( all(is.na(res$cminy[4:10])) )
+ expect_true(all(is.na(res$cminx[3:10])))
+ expect_true(all(is.na(res$cminy[4:10])))
- expect_true( all(is.na(res$cmaxx[3:10])) )
- expect_true( all(is.na(res$cmaxy[4:10])) )
+ expect_true(all(is.na(res$cmaxx[3:10])))
+ expect_true(all(is.na(res$cmaxy[4:10])))
})
-test_that( "lead and lag simple hybrid version gives correct results (#133)", {
+test_that("lead and lag simple hybrid version gives correct results (#133)", {
res <- group_by(mtcars, cyl) %>%
- mutate( disp_lag_2 = lag(disp, 2), disp_lead_2 = lead(disp, 2) ) %>%
+ mutate(disp_lag_2 = lag(disp, 2), disp_lead_2 = lead(disp, 2)) %>%
summarise(
- lag1 = all(is.na(head(disp_lag_2, 2))),
- lag2 = all(!is.na(tail(disp_lag_2, -2))),
-
- lead1 = all(is.na(tail(disp_lead_2, 2))),
- lead2 = all(!is.na(head(disp_lead_2, -2)))
+ lag1 = all(is.na(head(disp_lag_2, 2))),
+ lag2 = all(!is.na(tail(disp_lag_2, -2))),
+ lead1 = all(is.na(tail(disp_lead_2, 2))),
+ lead2 = all(!is.na(head(disp_lead_2, -2)))
)
- expect_true( all(res$lag1) )
- expect_true( all(res$lag2) )
- expect_true( all(res$lead1) )
- expect_true( all(res$lead2) )
+ expect_true(all(res$lag1))
+ expect_true(all(res$lag2))
+
+ expect_true(all(res$lead1))
+ expect_true(all(res$lead2))
})
test_that("min_rank handles columns full of NaN (#726)", {
test <- data.frame(
- Name = c("a", "b","c", "d", "e"),
+ Name = c("a", "b", "c", "d", "e"),
ID = c(1, 1, 1, 1, 1),
expression = c(NaN, NaN, NaN, NaN, NaN)
)
- data <- group_by(test, ID) %>% mutate(rank = min_rank(expression) )
- expect_true( all(is.na(data$rank)) )
+ data <- group_by(test, ID) %>% mutate(rank = min_rank(expression))
+ expect_true(all(is.na(data$rank)))
})
test_that("rank functions deal correctly with NA (#774)", {
- data <- data_frame( x = c(1,2,NA,1,0,NA) )
+ data <- data_frame(x = c(1, 2, NA, 1, 0, NA))
res <- data %>% mutate(
min_rank = min_rank(x),
percent_rank = percent_rank(x),
dense_rank = dense_rank(x),
cume_dist = cume_dist(x),
- ntile = ntile(x,2),
+ ntile = ntile(x, 2),
row_number = row_number(x)
)
- expect_true( all( is.na( res$min_rank[c(3,6)] ) ) )
- expect_true( all( is.na( res$dense_rank[c(3,6)] ) ) )
- expect_true( all( is.na( res$percent_rank[c(3,6)] ) ) )
- expect_true( all( is.na( res$cume_dist[c(3,6)] ) ) )
- expect_true( all( is.na( res$ntile[c(3,6)] ) ) )
- expect_true( all( is.na( res$row_number[c(3,6)] ) ) )
-
- expect_equal( res$percent_rank[ c(1,2,4,5) ], c(1/3, 1, 1/3, 0 ) )
- expect_equal( res$min_rank[ c(1,2,4,5) ], c(2L,4L,2L,1L) )
- expect_equal( res$dense_rank[ c(1,2,4,5) ], c(2L,3L,2L,1L) )
- expect_equal( res$cume_dist[ c(1,2,4,5) ], c(.75,1,.75,.25) )
- expect_equal( res$ntile[ c(1,2,4,5) ], c(1L,2L,2L,1L) )
- expect_equal( res$row_number[ c(1,2,4,5) ], c(2L,4L,3L,1L) )
+ expect_true(all(is.na(res$min_rank[c(3, 6)])))
+ expect_true(all(is.na(res$dense_rank[c(3, 6)])))
+ expect_true(all(is.na(res$percent_rank[c(3, 6)])))
+ expect_true(all(is.na(res$cume_dist[c(3, 6)])))
+ expect_true(all(is.na(res$ntile[c(3, 6)])))
+ expect_true(all(is.na(res$row_number[c(3, 6)])))
+
+ expect_equal(res$percent_rank[ c(1, 2, 4, 5) ], c(1 / 3, 1, 1 / 3, 0))
+ expect_equal(res$min_rank[ c(1, 2, 4, 5) ], c(2L, 4L, 2L, 1L))
+ expect_equal(res$dense_rank[ c(1, 2, 4, 5) ], c(2L, 3L, 2L, 1L))
+ expect_equal(res$cume_dist[ c(1, 2, 4, 5) ], c(.75, 1, .75, .25))
+ expect_equal(res$ntile[ c(1, 2, 4, 5) ], c(1L, 2L, 2L, 1L))
+ expect_equal(res$row_number[ c(1, 2, 4, 5) ], c(2L, 4L, 3L, 1L))
data <- data_frame(
- x = rep(c(1,2,NA,1,0,NA), 2),
- g = rep(c(1,2), each = 6)
+ x = rep(c(1, 2, NA, 1, 0, NA), 2),
+ g = rep(c(1, 2), each = 6)
)
res <- data %>%
group_by(g) %>%
@@ -173,27 +130,27 @@ test_that("rank functions deal correctly with NA (#774)", {
percent_rank = percent_rank(x),
dense_rank = dense_rank(x),
cume_dist = cume_dist(x),
- ntile = ntile(x,2),
+ ntile = ntile(x, 2),
row_number = row_number(x)
)
- expect_true( all( is.na( res$min_rank[c(3,6,9,12)] ) ) )
- expect_true( all( is.na( res$dense_rank[c(3,6,9,12)] ) ) )
- expect_true( all( is.na( res$percent_rank[c(3,6,9,12)] ) ) )
- expect_true( all( is.na( res$cume_dist[c(3,6,9,12)] ) ) )
- expect_true( all( is.na( res$ntile[c(3,6,9,12)] ) ) )
- expect_true( all( is.na( res$row_number[c(3,6,9,12)] ) ) )
-
- expect_equal( res$percent_rank[ c(1,2,4,5,7,8,10,11) ], rep(c(1/3, 1, 1/3, 0 ), 2) )
- expect_equal( res$min_rank[ c(1,2,4,5,7,8,10,11) ], rep(c(2L,4L,2L,1L), 2) )
- expect_equal( res$dense_rank[ c(1,2,4,5,7,8,10,11) ], rep(c(2L,3L,2L,1L), 2) )
- expect_equal( res$cume_dist[ c(1,2,4,5,7,8,10,11) ], rep(c(.75,1,.75,.25), 2) )
- expect_equal( res$ntile[ c(1,2,4,5,7,8,10,11) ], rep(c(1L,2L,2L,1L), 2) )
- expect_equal( res$row_number[ c(1,2,4,5,7,8,10,11) ], rep(c(2L,4L,3L,1L), 2 ) )
+ expect_true(all(is.na(res$min_rank[c(3, 6, 9, 12)])))
+ expect_true(all(is.na(res$dense_rank[c(3, 6, 9, 12)])))
+ expect_true(all(is.na(res$percent_rank[c(3, 6, 9, 12)])))
+ expect_true(all(is.na(res$cume_dist[c(3, 6, 9, 12)])))
+ expect_true(all(is.na(res$ntile[c(3, 6, 9, 12)])))
+ expect_true(all(is.na(res$row_number[c(3, 6, 9, 12)])))
+
+ expect_equal(res$percent_rank[ c(1, 2, 4, 5, 7, 8, 10, 11) ], rep(c(1 / 3, 1, 1 / 3, 0), 2))
+ expect_equal(res$min_rank[ c(1, 2, 4, 5, 7, 8, 10, 11) ], rep(c(2L, 4L, 2L, 1L), 2))
+ expect_equal(res$dense_rank[ c(1, 2, 4, 5, 7, 8, 10, 11) ], rep(c(2L, 3L, 2L, 1L), 2))
+ expect_equal(res$cume_dist[ c(1, 2, 4, 5, 7, 8, 10, 11) ], rep(c(.75, 1, .75, .25), 2))
+ expect_equal(res$ntile[ c(1, 2, 4, 5, 7, 8, 10, 11) ], rep(c(1L, 2L, 2L, 1L), 2))
+ expect_equal(res$row_number[ c(1, 2, 4, 5, 7, 8, 10, 11) ], rep(c(2L, 4L, 3L, 1L), 2))
})
test_that("lag and lead work on factors inside mutate (#955)", {
- test_factor <- factor(rep(c('A','B','C'), each = 3))
+ test_factor <- factor(rep(c("A", "B", "C"), each = 3))
exp_lag <- test_factor != lag(test_factor)
exp_lead <- test_factor != lead(test_factor)
@@ -202,23 +159,23 @@ test_that("lag and lead work on factors inside mutate (#955)", {
is_diff_lag = (test != lag(test)),
is_diff_lead = (test != lead(test))
)
- expect_equal( exp_lag , res$is_diff_lag )
- expect_equal( exp_lead, res$is_diff_lead)
+ expect_equal(exp_lag , res$is_diff_lag)
+ expect_equal(exp_lead, res$is_diff_lead)
})
test_that("lag handles default argument in mutate (#915)", {
- blah <- data.frame(x1 = c(5,10,20,27,35,58,5,6), y = 8:1)
+ blah <- data.frame(x1 = c(5, 10, 20, 27, 35, 58, 5, 6), y = 8:1)
blah <- mutate(blah,
- x2 = x1 - lag(x1, n=1, default=0),
- x3 = x1 - lead(x1, n=1, default=0),
- x4 = lag(x1, n=1L, order_by = y),
- x5 = lead(x1, n=1L, order_by = y)
+ x2 = x1 - lag(x1, n = 1, default = 0),
+ x3 = x1 - lead(x1, n = 1, default = 0),
+ x4 = lag(x1, n = 1L, order_by = y),
+ x5 = lead(x1, n = 1L, order_by = y)
)
- expect_equal( blah$x2, blah$x1 - lag(blah$x1, n = 1, default = 0))
- expect_equal( blah$x3, blah$x1 - lead(blah$x1, n = 1, default = 0))
- expect_equal( blah$x4, lag( blah$x1, n=1L, order_by = blah$y ) )
- expect_equal( blah$x5, lead( blah$x1, n=1L, order_by = blah$y ) )
+ expect_equal(blah$x2, blah$x1 - lag(blah$x1, n = 1, default = 0))
+ expect_equal(blah$x3, blah$x1 - lead(blah$x1, n = 1, default = 0))
+ expect_equal(blah$x4, lag(blah$x1, n = 1L, order_by = blah$y))
+ expect_equal(blah$x5, lead(blah$x1, n = 1L, order_by = blah$y))
})
# FIXME: this should only fail if strict checking is on.
@@ -230,3 +187,15 @@ test_that("lag handles default argument in mutate (#915)", {
# expect_error(df_sqlite %>% mutate(x > mean(x)), "does not support")
# expect_error(df_sqlite %>% mutate(r = row_number()), "does not support")
# })
+
+test_that("dim attribute is stripped from grouped mutate (#1918)", {
+ df <- data.frame(a = 1:3, b = 1:3)
+
+ df_regular <- mutate(df, b = scale(b))
+ df_grouped <- mutate(group_by(df, a), b = scale(b))
+ df_rowwise <- mutate(rowwise(df), b = scale(b))
+
+ expect_null(dim(df$b))
+ expect_null(dim(df_grouped$b))
+ expect_null(dim(df_rowwise$b))
+})
diff --git a/tests/testthat/test-mutate.r b/tests/testthat/test-mutate.r
index 7da1201..e16b93f 100644
--- a/tests/testthat/test-mutate.r
+++ b/tests/testthat/test-mutate.r
@@ -21,16 +21,18 @@ test_that("repeated outputs applied progressively (grouped_df)", {
expect_equal(out$z, c(3L, 3L))
})
-df <- data.frame(x = 1:10, y = 6:15)
-tbls <- test_load(df)
-
test_that("two mutates equivalent to one", {
- compare_tbls(tbls, function(tbl) tbl %>% mutate(x2 = x * 2, y4 = y * 4))
+ df <- tibble(x = 1:10, y = 6:15)
+
+ df1 <- df %>% mutate(x2 = x * 2, y4 = y * 4)
+ df2 <- df %>% mutate(x2 = x * 2) %>% mutate(y4 = y * 4)
+
+ expect_equal(df1, df2)
})
test_that("mutate can refer to variables that were just created (#140)", {
res <- mutate(tbl_df(mtcars), cyl1 = cyl + 1, cyl2 = cyl1 + 1)
- expect_equal(res$cyl2, mtcars$cyl+2)
+ expect_equal(res$cyl2, mtcars$cyl + 2)
gmtcars <- group_by(tbl_df(mtcars), am)
res <- mutate(gmtcars, cyl1 = cyl + 1, cyl2 = cyl1 + 1)
@@ -41,20 +43,23 @@ test_that("mutate can refer to variables that were just created (#140)", {
test_that("mutate handles logical result (#141)", {
x <- data.frame(x = 1:10, g = rep(c(1, 2), each = 5))
res <- tbl_df(x) %>% group_by(g) %>% mutate(r = x > mean(x))
- expect_equal(res$r, rep(c(FALSE,FALSE,FALSE,TRUE,TRUE), 2))
+ expect_equal(res$r, rep(c(FALSE, FALSE, FALSE, TRUE, TRUE), 2))
})
test_that("mutate can rename variables (#137)", {
res <- mutate(tbl_df(mtcars), cyl2 = cyl)
expect_equal(res$cyl2, mtcars$cyl)
- res <- mutate(group_by(tbl_df(mtcars), am) , cyl2 = cyl)
+ res <- mutate(group_by(tbl_df(mtcars), am), cyl2 = cyl)
expect_equal(res$cyl2, res$cyl)
})
test_that("mutate refuses to modify grouping vars (#143)", {
- expect_error(mutate(group_by(tbl_df(mtcars), am) , am = am + 2),
- "cannot modify grouping variable")
+ expect_error(
+ mutate(group_by(tbl_df(mtcars), am), am = am + 2),
+ "Column `am` can't be modified because it's a grouping variable",
+ fixed = TRUE
+ )
})
test_that("mutate handles constants (#152)", {
@@ -64,35 +69,48 @@ test_that("mutate handles constants (#152)", {
test_that("mutate fails with wrong result size (#152)", {
df <- group_by(data.frame(x = c(2, 2, 3, 3)), x)
- expect_equal(mutate(df, y = 1:2)$y, rep(1:2,2))
- expect_error(mutate( mtcars, zz = 1:2 ), "wrong result size" )
+ expect_equal(mutate(df, y = 1:2)$y, rep(1:2, 2))
+ expect_error(
+ mutate(mtcars, zz = 1:2),
+ "Column `zz` must be length 32 (the number of rows) or one, not 2",
+ fixed = TRUE
+ )
df <- group_by(data.frame(x = c(2, 2, 3, 3, 3)), x)
- expect_error(mutate(df, y = 1:2))
+ expect_error(
+ mutate(df, y = 1:2),
+ "Column `y` must be length 3 (the group size) or one, not 2",
+ fixed = TRUE
+ )
})
test_that("mutate refuses to use symbols not from the data", {
y <- 1:6
df <- group_by(data.frame(x = c(1, 2, 2, 3, 3, 3)), x)
- expect_error(mutate( df, z = y ))
+ expect_error(
+ mutate(df, z = y),
+ "Column `z` must be length 1 (the group size), not 6",
+ fixed = TRUE
+ )
})
test_that("mutate recycles results of length 1", {
df <- data.frame(x = c(2, 2, 3, 3))
- expect_equal(mutate(tbl_df(df), z = length(x) )$z, rep(4,4))
- expect_equal(mutate(group_by(df,x), z = length(x) )$z, rep(2,4))
+ expect_equal(mutate(tbl_df(df), z = length(x))$z, rep(4, 4))
+ expect_equal(mutate(group_by(df, x), z = length(x))$z, rep(2, 4))
int <- 1L
str <- "foo"
num <- 1
bool <- TRUE
+ list <- list(NULL)
- res <- mutate(group_by(df,x),
- int = int, str = str, num = num, bool = bool)
- expect_equal(res$int , rep(int ,4))
- expect_equal(res$str , rep(str ,4))
- expect_equal(res$num , rep(num ,4))
- expect_equal(res$bool, rep(bool,4))
+ res <- mutate(group_by(df, x), int = int, str = str, num = num, bool = bool, list = list)
+ expect_equal(res$int , rep(int , 4))
+ expect_equal(res$str , rep(str , 4))
+ expect_equal(res$num , rep(num , 4))
+ expect_equal(res$bool, rep(bool, 4))
+ expect_equal(res$list, rep(list, 4))
})
@@ -100,77 +118,90 @@ test_that("mutate handles out of data variables", {
today <- Sys.Date()
now <- Sys.time()
df <- data.frame(x = c(2, 2, 3, 3))
- gdf <- group_by(df,x)
+ gdf <- group_by(df, x)
- int <- c(1L,2L)
+ int <- c(1L, 2L)
str <- c("foo", "bar")
- num <- c(1,2)
- bool <- c(TRUE,FALSE)
- dat <- rep(today,2)
- tim <- rep(now,2)
-
- res <- mutate(gdf, int = int, str = str, num = num, bool = bool,
- dat = dat, tim = tim)
- expect_equal(res$int , rep(int ,2))
- expect_equal(res$str , rep(str ,2))
- expect_equal(res$num , rep(num ,2))
- expect_equal(res$bool, rep(bool,2))
- expect_equal(res$dat, rep(dat,2))
- expect_equal(res$tim, rep(tim,2))
+ num <- c(1, 2)
+ bool <- c(TRUE, FALSE)
+ dat <- rep(today, 2)
+ tim <- rep(now, 2)
+
+ res <- mutate(
+ gdf,
+ int = int, str = str, num = num, bool = bool, dat = dat, tim = tim
+ )
+ expect_equal(res$int , rep(int , 2))
+ expect_equal(res$str , rep(str , 2))
+ expect_equal(res$num , rep(num , 2))
+ expect_equal(res$bool, rep(bool, 2))
+ expect_equal(res$dat , rep(dat , 2))
+ expect_equal(res$tim , rep(tim , 2))
int <- 1:6
- expect_error(mutate(gdf, int = int))
- expect_error(mutate(tbl_df(df), int = int))
+ expect_error(
+ mutate(gdf, int = int),
+ "Column `int` must be length 2 (the group size) or one, not 6",
+ fixed = TRUE
+ )
+ expect_error(
+ mutate(tbl_df(df), int = int),
+ "Column `int` must be length 4 (the number of rows) or one, not 6",
+ fixed = TRUE
+ )
int <- 1:4
- str <- rep(c("foo", "bar"), 2 )
- num <- c(1,2,3,4)
- bool <- c(TRUE,FALSE,FALSE,TRUE)
- dat <- rep(today,4)
- tim <- rep(now,4)
-
- res <- mutate(tbl_df(df), int = int, str = str, num = num, bool = bool, tim = tim, dat=dat)
- expect_equal(res$int , int )
- expect_equal(res$str , str )
- expect_equal(res$num , num )
- expect_equal(res$bool, bool )
- expect_equal(res$dat , dat )
- expect_equal(res$tim , tim )
+ str <- rep(c("foo", "bar"), 2)
+ num <- c(1, 2, 3, 4)
+ bool <- c(TRUE, FALSE, FALSE, TRUE)
+ dat <- rep(today, 4)
+ tim <- rep(now, 4)
+
+ res <- mutate(
+ tbl_df(df),
+ int = int, str = str, num = num, bool = bool, tim = tim, dat = dat
+ )
+ expect_equal(res$int , int)
+ expect_equal(res$str , str)
+ expect_equal(res$num , num)
+ expect_equal(res$bool, bool)
+ expect_equal(res$dat , dat)
+ expect_equal(res$tim , tim)
})
test_that("mutate handles passing ...", {
- df <- data.frame( x = 1:4 )
+ df <- data.frame(x = 1:4)
- f <- function(...){
+ f <- function(...) {
x1 <- 1
f1 <- function(x) x
- mutate(df, ..., x1 = f1(x1) )
+ mutate(df, ..., x1 = f1(x1))
}
- g <- function(...){
+ g <- function(...) {
x2 <- 2
f(x2 = x2, ...)
}
- h <- function(before = "before", ..., after = "after"){
- g(before = before, ..., after = after )
+ h <- function(before = "before", ..., after = "after") {
+ g(before = before, ..., after = after)
}
- res <- h( x3 = 3 )
- expect_equal(res$x1, rep(1,4) )
- expect_equal(res$x2, rep(2,4) )
+ res <- h(x3 = 3)
+ expect_equal(res$x1, rep(1, 4))
+ expect_equal(res$x2, rep(2, 4))
expect_equal(res$before, rep("before", 4))
expect_equal(res$after, rep("after", 4))
df <- tbl_df(df)
- res <- h( x3 = 3 )
- expect_equal(res$x1, rep(1,4) )
- expect_equal(res$x2, rep(2,4) )
+ res <- h(x3 = 3)
+ expect_equal(res$x1, rep(1, 4))
+ expect_equal(res$x2, rep(2, 4))
expect_equal(res$before, rep("before", 4))
expect_equal(res$after, rep("after", 4))
df <- group_by(df, x)
- res <- h( x3 = 3 )
- expect_equal(res$x1, rep(1,4) )
- expect_equal(res$x2, rep(2,4) )
+ res <- h(x3 = 3)
+ expect_equal(res$x1, rep(1, 4))
+ expect_equal(res$x2, rep(2, 4))
expect_equal(res$before, rep("before", 4))
expect_equal(res$after, rep("after", 4))
@@ -178,10 +209,21 @@ test_that("mutate handles passing ...", {
test_that("mutate fails on unsupported column type", {
df <- data.frame(created = c("2014/1/1", "2014/1/2", "2014/1/2"))
- expect_error(mutate(df, date = strptime(created, "%Y/%m/%d")) )
+ expect_error(
+ mutate(df, date = strptime(created, "%Y/%m/%d")),
+ "Column `date` is of unsupported class POSIXlt",
+ fixed = TRUE
+ )
- df <- data.frame(created = c("2014/1/1", "2014/1/2", "2014/1/2"), g = c(1,1,2) )
- expect_error(mutate(group_by(df,g), date = strptime(created, "%Y/%m/%d")) )
+ df <- data.frame(
+ created = c("2014/1/1", "2014/1/2", "2014/1/2"),
+ g = c(1, 1, 2)
+ )
+ expect_error(
+ mutate(group_by(df, g), date = strptime(created, "%Y/%m/%d")),
+ "Column `date` is of unsupported class POSIXlt",
+ fixed = TRUE
+ )
})
test_that("mutate modifies same column repeatedly (#243)", {
@@ -189,13 +231,20 @@ test_that("mutate modifies same column repeatedly (#243)", {
expect_equal(mutate(df, x = x + 1, x = x + 1)$x, 3)
})
-test_that("mutate errors when results are not compatible accross groups (#299)",{
+test_that("mutate errors when results are not compatible accross groups (#299)", {
d <- data.frame(x = rep(1:5, each = 3))
- expect_error(mutate(group_by(d,x),val = ifelse(x < 3, "foo", 2)))
+ expect_error(
+ mutate(group_by(d, x), val = ifelse(x < 3, "foo", 2)),
+ "Column `val` can't be converted from character to numeric",
+ fixed = TRUE
+ )
})
-test_that("assignments are forbidden (#315)", {
- expect_error(mutate(mtcars, cyl2 = { x <- cyl^2; -x } ))
+test_that("assignments don't overwrite variables (#315)", {
+ expect_equal(
+ mutate(mtcars, cyl2 = { mpg <- cyl ^ 2; -mpg }),
+ mutate(mtcars, cyl2 = -cyl ^ 2)
+ )
})
test_that("hybrid evaluator uses correct environment (#403)", {
@@ -204,23 +253,64 @@ test_that("hybrid evaluator uses correct environment (#403)", {
mutate(mtcars, xx = func2(mpg / sum(mpg)))
}
res <- func1()
- expect_equal(res$xx, rep(0,nrow(res)) )
+ expect_equal(res$xx, rep(0, nrow(res)))
})
test_that("mutate remove variables with = NULL syntax (#462)", {
data <- mtcars %>% mutate(cyl = NULL)
- expect_false( "cyl" %in% names(data) )
+ expect_false("cyl" %in% names(data))
data <- mtcars %>% group_by(disp) %>% mutate(cyl = NULL)
- expect_false( "cyl" %in% names(data) )
+ expect_false("cyl" %in% names(data))
+})
+
+test_that("mutate strips names, but only if grouped (#1689, #2675)", {
+ data <- data_frame(a = 1:3) %>% mutate(b = setNames(nm = a))
+ expect_equal(names(data$b), as.character(1:3))
+
+ data <- data_frame(a = 1:3) %>% rowwise %>% mutate(b = setNames(nm = a))
+ expect_null(names(data$b))
+
+ data <- data_frame(a = c(1, 1, 2)) %>% group_by(a) %>% mutate(b = setNames(nm = a))
+ expect_null(names(data$b))
+})
+
+test_that("mutate does not strip names of list-columns (#2675)", {
+ vec <- list(a = 1, b = 2)
+ data <- data_frame(x = vec)
+ data <- mutate(data, x)
+ expect_identical(names(vec), c("a", "b"))
+ expect_identical(names(data$x), c("a", "b"))
+})
+
+test_that("mutate gives a nice error message if an expression evaluates to NULL (#2187)", {
+ df <- data_frame(a = 1:3)
+ gf <- group_by(df, a)
+ rf <- rowwise(df)
+
+ expect_error(
+ mutate(df, b = identity(NULL)),
+ "Column `b` is of unsupported type NULL",
+ fixed = TRUE
+ )
+ expect_error(
+ mutate(gf, b = identity(NULL)),
+ "Column `b` is of unsupported type NULL",
+ fixed = TRUE
+ )
+ expect_error(
+ mutate(rf, b = identity(NULL)),
+ "Column `b` is of unsupported type NULL",
+ fixed = TRUE
+ )
})
test_that("mutate(rowwise_df) makes a rowwise_df (#463)", {
- one_mod <- data.frame(grp = "a", x = runif(5,0,1)) %>%
+ one_mod <- data.frame(grp = "a", x = runif(5, 0, 1)) %>%
tbl_df %>%
- mutate(y = rnorm(x,x*2,1)) %>%
+ mutate(y = rnorm(x, x * 2, 1)) %>%
group_by(grp) %>%
- do(mod = lm(y~x,data = .))
+ do(mod = lm(y~x, data = .))
out <- one_mod %>%
mutate(rsq = summary(mod)$r.squared) %>%
@@ -229,23 +319,24 @@ test_that("mutate(rowwise_df) makes a rowwise_df (#463)", {
expect_is(out, "rowwise_df")
expect_equal(nrow(out), 1L)
expect_is(out$mod, "list")
- expect_is(out$mod[[1L]], "lm" )
+ expect_is(out$mod[[1L]], "lm")
})
test_that("mutate allows list columns (#555)", {
df <- data.frame(x = c("a;b", "c;d;e"), stringsAsFactors = FALSE)
- res <- mutate( df, pieces = strsplit(x, ";"))
+ res <- mutate(df, pieces = strsplit(x, ";"))
expect_equal(res$pieces, list(c("a", "b"), c("c", "d", "e")))
})
test_that("hybrid evaluation goes deep enough (#554)", {
res1 <- iris %>% mutate(test = 1 == 2 | row_number() < 10)
res2 <- iris %>% mutate(test = row_number() < 10 | 1 == 2)
- expect_equal(res1,res2)
+ expect_equal(res1, res2)
})
test_that("hybrid does not segfault when given non existing variable (#569)", {
- expect_error( mtcars %>% summarise(first(mp)), "could not find variable" )
+ # error message from rlang
+ expect_error(mtcars %>% summarise(first(mp)))
})
test_that("namespace extraction works in hybrid (#412)", {
@@ -272,139 +363,166 @@ test_that("mutate supports difftime objects (#390)", {
grp = c(1, 1, 2, 2),
val = c(1, 3, 4, 6),
date1 = c(rep(Sys.Date() - 10, 2), rep(Sys.Date() - 20, 2)),
- date2 = Sys.Date() + c(1,2,1,2),
+ date2 = Sys.Date() + c(1, 2, 1, 2),
diffdate = difftime(date2, date1, unit = "days")
)
- res <- df %>% group_by(grp) %>%
- mutate(mean_val = mean(val), mean_diffdate = mean(diffdate) )
+ res <- df %>%
+ group_by(grp) %>%
+ mutate(mean_val = mean(val), mean_diffdate = mean(diffdate))
expect_is(res$mean_diffdate, "difftime")
- expect_equal( as.numeric(res$mean_diffdate), c(11.5,11.5,21.5,21.5))
+ expect_equal(as.numeric(res$mean_diffdate), c(11.5, 11.5, 21.5, 21.5))
res <- df %>% group_by(grp) %>% summarise(dt = mean(diffdate))
- expect_is( res$dt, "difftime" )
- expect_equal( as.numeric(res$dt), c(11.5,21.5) )
+ expect_is(res$dt, "difftime")
+ expect_equal(as.numeric(res$dt), c(11.5, 21.5))
})
test_that("mutate works on zero-row grouped data frame (#596)", {
dat <- data.frame(a = numeric(0), b = character(0))
- res <- dat %>% group_by(b) %>% mutate(a2 = a*2)
+ res <- dat %>% group_by(b) %>% mutate(a2 = a * 2)
expect_is(res$a2, "numeric")
expect_is(res, "grouped_df")
expect_equal(res$a2, numeric(0))
expect_equal(attr(res, "indices"), list())
- expect_equal(attr(res, "vars"), list( quote(b) ))
+ expect_equal(attr(res, "vars"), "b")
expect_equal(attr(res, "group_sizes"), integer(0))
expect_equal(attr(res, "biggest_group_size"), 0L)
})
test_that("Non-ascii column names in version 0.3 are not duplicated (#636)", {
- df <- data_frame(a = "1", b = "2")
+ # Currently failing (#2967)
+ skip_on_os("windows")
+ df <- data_frame(a = "1", b = "2")
names(df) <- c("a", enc2native("\u4e2d"))
- res <- df %>% mutate_each(funs(as.numeric)) %>% names
+ res <- df %>% mutate_all(funs(as.numeric)) %>% names
expect_equal(res, names(df))
})
test_that("nested hybrid functions do the right thing (#637)", {
res <- mtcars %>% mutate(mean(1))
- expect_true( all( res[["mean(1)"]] == 1L ) )
+ expect_true(all(res[["mean(1)"]] == 1L))
})
test_that("mutate handles using and gathering complex data (#436)", {
- d <- data_frame(x=1:10, y=1:10+2i)
- res <- mutate(d, real=Re(y), imag=Im(y), z=2*y, constant=2+2i)
+ d <- data_frame(x = 1:10, y = 1:10 + 2i)
+ res <- mutate(d, real = Re(y), imag = Im(y), z = 2 * y, constant = 2 + 2i)
expect_equal(names(res), c("x", "y", "real", "imag", "z", "constant"))
expect_equal(res$real, Re(d$y))
expect_equal(res$imag, Im(d$y))
expect_equal(res$z, d$y * 2)
- expect_true( all(res$constant == 2+2i) )
+ expect_true(all(res$constant == 2 + 2i))
})
test_that("mutate forbids POSIXlt results (#670)", {
expect_error(
- data.frame(time='2014/01/01 10:10:10') %>% mutate(time=as.POSIXlt(time)),
- "does not support"
+ data.frame(time = "2014/01/01 10:10:10") %>%
+ mutate(time = as.POSIXlt(time)),
+ "Column `time` is of unsupported class POSIXlt",
+ fixed = TRUE
)
expect_error(
- data.frame(time='2014/01/01 10:10:10', a=2) %>% group_by(a) %>% mutate(time=as.POSIXlt(time)),
- "does not support"
+ data.frame(time = "2014/01/01 10:10:10", a = 2) %>%
+ group_by(a) %>%
+ mutate(time = as.POSIXlt(time)),
+ "Column `time` is of unsupported class POSIXlt",
+ fixed = TRUE
)
})
-test_that("constant factor can be handled by mutate (#715)",{
- d <- data_frame(x=1:2) %>% mutate(y=factor("A"))
- expect_true( is.factor(d$y) )
- expect_equal( d$y, factor( c("A", "A") ) )
+test_that("constant factor can be handled by mutate (#715)", {
+ d <- data_frame(x = 1:2) %>% mutate(y = factor("A"))
+ expect_true(is.factor(d$y))
+ expect_equal(d$y, factor(c("A", "A")))
})
test_that("row_number handles empty data frames (#762)", {
df <- data.frame(a = numeric(0))
res <- df %>% mutate(
- row_number_0 = row_number(), row_number_a = row_number(a), ntile = ntile(a, 2),
- min_rank = min_rank(a), percent_rank = percent_rank(a),
- dense_rank = dense_rank(a), cume_dist = cume_dist(a)
+ row_number_0 = row_number(),
+ row_number_a = row_number(a),
+ ntile = ntile(a, 2),
+ min_rank = min_rank(a),
+ percent_rank = percent_rank(a),
+ dense_rank = dense_rank(a),
+ cume_dist = cume_dist(a)
)
- expect_equal( names(res), c("a", "row_number_0", "row_number_a", "ntile", "min_rank", "percent_rank", "dense_rank", "cume_dist" ) )
- expect_equal( nrow(res), 0L )
+ expect_equal(
+ names(res),
+ c("a", "row_number_0", "row_number_a", "ntile", "min_rank", "percent_rank", "dense_rank", "cume_dist")
+ )
+ expect_equal(nrow(res), 0L)
})
test_that("no utf8 invasion (#722)", {
- skip_on_cran()
+ skip_on_os("windows")
- source("utf-8.R", local = TRUE)
+ source("utf-8.R", local = TRUE, encoding = "UTF-8")
})
test_that("mutate works on empty data frames (#1142)", {
df <- data.frame()
res <- df %>% mutate
- expect_equal( nrow(res), 0L )
- expect_equal( length(res), 0L )
+ expect_equal(nrow(res), 0L)
+ expect_equal(length(res), 0L)
res <- df %>% mutate(x = numeric())
- expect_equal( names(res), "x")
- expect_equal( nrow(res), 0L )
- expect_equal( length(res), 1L)
+ expect_equal(names(res), "x")
+ expect_equal(nrow(res), 0L)
+ expect_equal(length(res), 1L)
})
-test_that("mutate handles 0 rows rowwise #1300",{
- a <- data.frame(x= 1)
- b <- data.frame(y = character(), stringsAsFactors = F)
+test_that("mutate handles 0 rows rowwise (#1300)", {
+ a <- data_frame(x = 1)
+ b <- data_frame(y = character())
- g <- function(y){1}
- f <- function() { b %>% rowwise() %>% mutate(z = g(y))}
+ g <- function(y) {
+ 1
+ }
+ f <- function() {
+ b %>% rowwise() %>% mutate(z = g(y))
+ }
res <- f()
- expect_equal( nrow(res), 0L )
+ expect_equal(nrow(res), 0L)
- expect_error(a %>% mutate(b = f()), "wrong result size" )
- expect_error(a %>% rowwise() %>% mutate(b = f()), "incompatible size")
+ expect_error(
+ a %>% mutate(b = f()),
+ "Column `b` must be length 1 (the number of rows), not 2",
+ fixed = TRUE
+ )
+ expect_error(
+ a %>% rowwise() %>% mutate(b = f()),
+ "Column `b` must be length 1 (the group size), not 2",
+ fixed = TRUE
+ )
})
test_that("regression test for #637", {
res <- mtcars %>% mutate(xx = mean(1))
- expect_true( all(res$xx == 1))
+ expect_true(all(res$xx == 1))
res <- mtcars %>% mutate(xx = sum(mean(mpg)))
- expect_true( all( res$xx == sum(mean(mtcars$mpg))))
+ expect_true(all(res$xx == sum(mean(mtcars$mpg))))
})
test_that("mutate.rowwise handles factors (#886)", {
- res <- data.frame(processed=c("foo", "bar")) %>%
+ res <- data.frame(processed = c("foo", "bar")) %>%
rowwise() %>%
- mutate(processed_trafo=paste("test", processed))
- expect_equal( res$processed_trafo, c("test foo", "test bar"))
+ mutate(processed_trafo = paste("test", processed))
+ expect_equal(res$processed_trafo, c("test foo", "test bar"))
})
test_that("setting first column to NULL with mutate works (#1329)", {
- df <- data.frame(x = 1:10, y = 1:10)
- expect_equal( mutate(df, x=NULL), select(df,-x) )
- expect_equal( mutate(df, y=NULL), select(df,-y) )
+ df <- data.frame(x = 1:10, y = 1:10)
+ expect_equal(mutate(df, x = NULL), select(df, -x))
+ expect_equal(mutate(df, y = NULL), select(df, -y))
- gdf <- group_by(df, y)
- expect_equal( select(gdf, -x), mutate(gdf, x = NULL) )
+ gdf <- group_by(df, y)
+ expect_equal(select(gdf, -x), mutate(gdf, x = NULL))
})
test_that("mutate handles the all NA case (#958)", {
@@ -420,50 +538,29 @@ test_that("mutate handles the all NA case (#958)", {
group_by(x, y) %>%
mutate(max.sum = day[which.max(values)[1]]) %>%
mutate(adjusted_values = ifelse(day < max.sum, 30, values))
- expect_true( all(is.na( res$adjusted_values[1:12] )))
+ expect_true(all(is.na(res$adjusted_values[1:12])))
})
-test_that("rowwie mutate gives expected results (#1381)", {
- f <- function( x ) ifelse( x < 2, NA_real_, x )
- res <- data_frame( x = 1:3 ) %>% rowwise() %>% mutate( y = f(x) )
- expect_equal( res$y, c(NA,2,3) )
+test_that("rowwise mutate gives expected results (#1381)", {
+ f <- function(x) ifelse(x < 2, NA_real_, x)
+ res <- data_frame(x = 1:3) %>% rowwise() %>% mutate(y = f(x))
+ expect_equal(res$y, c(NA, 2, 3))
})
test_that("mutate handles factors (#1414)", {
- d <- data_frame( g = c(1,1,1,2,2,3,3), f = c("a", "b", "a", "a", "a", "b", "b" ) )
- res <- d %>% group_by(g) %>% mutate( f2 = factor(f) )
- expect_equal( as.character(res$f2), res$f)
-})
-
-test_that("mutate recognizes global #1469", {
- vs <- 4
- res <- mtcars %>% mutate(a = global(vs))
- expect_true( all(res$a == 4) )
- expect_error( mtcars %>% mutate(global("vs")), "global only handles symbols" )
- res <- mtcars %>% mutate(a = global(vs) + 1)
- expect_true( all(res$a == 5) )
- expect_error( mtcars %>% mutate(global("vs") + 1), "global only handles symbols" )
- res <- mtcars %>% mutate(a = 1+global(vs) )
- expect_true( all(res$a == 5) )
- expect_error( mtcars %>% mutate(1 + global("vs")), "global only handles symbols" )
-
- res <- mtcars %>% group_by(cyl) %>% mutate(a = global(vs))
- expect_true( all(res$a == 4) )
- expect_error( mtcars %>% group_by(cyl) %>% mutate(a = global("vs")), "global only handles symbols" )
- res <- mtcars %>% group_by(cyl) %>% mutate(a = global(vs)+1)
- expect_true( all(res$a == 5) )
- expect_error( mtcars %>% group_by(cyl) %>% mutate(a = global("vs") + 1), "global only handles symbols" )
-
- res <- mtcars %>% group_by(cyl) %>% mutate(a = 1+global(vs))
- expect_true( all(res$a == 5) )
- expect_error( mtcars %>% group_by(cyl) %>% mutate(a = 1 + global("vs")), "global only handles symbols" )
+ d <- data_frame(
+ g = c(1, 1, 1, 2, 2, 3, 3),
+ f = c("a", "b", "a", "a", "a", "b", "b")
+ )
+ res <- d %>% group_by(g) %>% mutate(f2 = factor(f, levels = c("a", "b")))
+ expect_equal(as.character(res$f2), res$f)
})
test_that("mutate handles results from one group with all NA values (#1463) ", {
- df <- data_frame( x = c(1, 2), y = c(1, NA))
- res <- df %>% group_by(x) %>% mutate( z = ifelse(y>1, 1, 2) )
- expect_true( is.na(res$z[2]) )
- expect_is( res$z, "numeric")
+ df <- data_frame(x = c(1, 2), y = c(1, NA))
+ res <- df %>% group_by(x) %>% mutate(z = ifelse(y > 1, 1, 2))
+ expect_true(is.na(res$z[2]))
+ expect_is(res$z, "numeric")
})
test_that("rowwise mutate handles the NA special case (#1448)", {
@@ -471,125 +568,216 @@ test_that("rowwise mutate handles the NA special case (#1448)", {
rowwise() %>%
mutate(l = ifelse(k > 0, 1, NA))
expect_is(res$l, "numeric")
- expect_true( is.na(res$l[1]) )
- expect_true( !anyNA(res$l[-1]) )
+ expect_true(is.na(res$l[1]))
+ expect_true(!anyNA(res$l[-1]))
res <- data.frame(k = rnorm(10)) %>%
rowwise() %>%
mutate(l = ifelse(k > 0, 1L, NA_integer_))
- expect_true( all(is.na(res$l[res$k <= 0]) ) )
- expect_true( !any(is.na(res$l[res$k > 0]) ) )
+ expect_true(all(is.na(res$l[res$k <= 0])))
+ expect_true(!any(is.na(res$l[res$k > 0])))
})
test_that("mutate disambiguates NA and NaN (#1448)", {
- Pass <- data.frame(P2 = c(0,3,2), F2 = c(0,2,0), id = 1:3)
- res <- Pass %>% group_by(id) %>% mutate(pass2 = P2/(P2 + F2))
- expect_true( is.nan(res$pass2[1]) )
+ Pass <- data.frame(P2 = c(0, 3, 2), F2 = c(0, 2, 0), id = 1:3)
+ res <- Pass %>%
+ group_by(id) %>%
+ mutate(pass2 = P2 / (P2 + F2))
+ expect_true(is.nan(res$pass2[1]))
- res <- Pass %>% rowwise %>% mutate(pass2 = P2/(P2 + F2))
- expect_true( is.nan(res$pass2[1]) )
+ res <- Pass %>%
+ rowwise %>%
+ mutate(pass2 = P2 / (P2 + F2))
+ expect_true(is.nan(res$pass2[1]))
Pass <- data_frame(
- P1 = c(2L, 0L, 10L,8L, 9L),
- F1 = c(0L, 2L, 0L, 4L,3L),
+ P1 = c(2L, 0L, 10L, 8L, 9L),
+ F1 = c(0L, 2L, 0L, 4L, 3L),
P2 = c(0L, 3L, 2L, 2L, 2L),
- F2 = c(0L, 2L, 0L, 1L,1L),
- id = c(1,2,4,4,5)
+ F2 = c(0L, 2L, 0L, 1L, 1L),
+ id = c(1, 2, 4, 4, 5)
)
res <- Pass %>%
- group_by(id) %>%
- dplyr::mutate(pass_rate = (P1 + P2) / (P1 + P2 + F1 + F2) * 100,
- pass_rate1 = P1 / (P1 + F1) * 100,
- pass_rate2 = P2 / (P2 + F2) * 100)
- expect_true( is.nan(res$pass_rate2[1]) )
+ group_by(id) %>%
+ mutate(
+ pass_rate = (P1 + P2) / (P1 + P2 + F1 + F2) * 100,
+ pass_rate1 = P1 / (P1 + F1) * 100,
+ pass_rate2 = P2 / (P2 + F2) * 100
+ )
+ expect_true(is.nan(res$pass_rate2[1]))
})
test_that("hybrid evaluator leaves formulas untouched (#1447)", {
d <- data_frame(g = 1:2, training = list(mtcars, mtcars * 2))
- mpg <- data.frame(x=1:10, y=1:10)
+ mpg <- data.frame(x = 1:10, y = 1:10)
res <- d %>%
group_by(g) %>%
mutate(lm_result = list(lm(mpg ~ wt, data = training[[1]])))
- expect_is( res$lm_result, "list" )
- expect_is( res$lm_result[[1]], "lm" )
- expect_is( res$lm_result[[2]], "lm" )
+ expect_is(res$lm_result, "list")
+ expect_is(res$lm_result[[1]], "lm")
+ expect_is(res$lm_result[[2]], "lm")
})
-test_that( "lead/lag inside mutate handles expressions as value for default (#1411) ", {
+test_that("lead/lag inside mutate handles expressions as value for default (#1411) ", {
df <- data_frame(x = 1:3)
- res <- mutate(df, leadn = lead(x, default = x[1]), lagn = lag(x, default = x[1]) )
- expect_equal( res$leadn, lead(df$x, default = df$x[1]) )
- expect_equal( res$lagn, lag(df$x, default = df$x[1]) )
+ res <- mutate(df, leadn = lead(x, default = x[1]), lagn = lag(x, default = x[1]))
+ expect_equal(res$leadn, lead(df$x, default = df$x[1]))
+ expect_equal(res$lagn, lag(df$x, default = df$x[1]))
res <- mutate(df, leadn = lead(x, default = c(1)), lagn = lag(x, default = c(1)))
- expect_equal( res$leadn, lead(df$x, default = 1) )
- expect_equal( res$lagn, lag(df$x, default = 1) )
-})
-
-test_that("mutate understands column. #1012", {
- ir1 <- mutate( iris, Sepal = Sepal.Length * Sepal.Width )
- ir2 <- mutate( iris, Sepal = column("Sepal.Length") * column("Sepal.Width") )
- expect_equal(ir1, ir2)
-
- ir1 <- mutate( group_by(iris, Species), Sepal = Sepal.Length * Sepal.Width )
- ir2 <- mutate( group_by(iris, Species), Sepal = column("Sepal.Length") * column("Sepal.Width") )
- expect_equal(ir1, ir2)
-
- ir <- iris %>% mutate( a = column("Species") )
- expect_equal( ir$a, ir$Species)
-
- ir <- iris %>% group_by(Species) %>% mutate( a = column("Species") )
- expect_equal( ir$a, ir$Species)
+ expect_equal(res$leadn, lead(df$x, default = 1))
+ expect_equal(res$lagn, lag(df$x, default = 1))
})
test_that("grouped mutate does not drop grouping attributes (#1020)", {
- d <- data.frame(subject=c('Jack','Jill'),id=c(2,1)) %>% group_by(subject)
+ d <- data.frame(subject = c("Jack", "Jill"), id = c(2, 1)) %>% group_by(subject)
a1 <- names(attributes(d))
- a2 <- names(attributes(d %>% mutate(foo=1)))
- expect_equal( setdiff(a1, a2), character(0) )
+ a2 <- names(attributes(d %>% mutate(foo = 1)))
+ expect_equal(setdiff(a1, a2), character(0))
})
-test_that("grouped mutate errors on incompatible column type (#1641)", {
- df <- data.frame(ID = rep(1:5, each = 3), x = 1:15) %>% group_by(ID)
- expect_error( mutate(df, foo = mean), 'Unsupported type CLOSXP for column "foo"')
+test_that("grouped mutate coerces integer + double -> double (#1892)", {
+ df <- data_frame(
+ id = c(1, 4),
+ value = c(1L, NA),
+ group = c("A", "B")
+ ) %>%
+ group_by(group) %>%
+ mutate(value = ifelse(is.na(value), 0, value))
+ expect_type(df$value, "double")
+ expect_identical(df$value, c(1, 0))
+})
+
+test_that("grouped mutate coerces factor + character -> character (WARN) (#1892)", {
+ factor_or_character <- function(x) {
+ if (x > 3) {
+ return(factor("hello"))
+ } else {
+ return("world")
+ }
+ }
+
+ df <- data_frame(
+ id = c(1, 4),
+ group = c("A", "B")
+ ) %>%
+ group_by(group)
+ expect_warning(
+ df <- df %>%
+ mutate(value = factor_or_character(id))
+ )
+ expect_type(df$value, "character")
+ expect_identical(df$value, c("world", "hello"))
})
test_that("lead/lag works on more complex expressions (#1588)", {
- df <- data_frame(x = rep(1:5,2), g = rep(1:2, each = 5) ) %>% group_by(g)
- res <- df %>% mutate( y = lead(x > 3) )
- expect_equal(res$y, rep(lead(1:5 > 3), 2) )
+ df <- data_frame(x = rep(1:5, 2), g = rep(1:2, each = 5)) %>% group_by(g)
+ res <- df %>% mutate(y = lead(x > 3))
+ expect_equal(res$y, rep(lead(1:5 > 3), 2))
})
test_that("Adding a Column of NA to a Grouped Table gives expected results (#1645)", {
dataset <- data_frame(A = 1:10, B = 10:1, group = factor(sample(LETTERS[25:26], 10, TRUE)))
res <- dataset %>% group_by(group) %>% mutate(prediction = factor(NA))
- expect_true( all(is.na(res$prediction) ) )
- expect_is( res$prediction, "factor")
- expect_equal( levels(res$prediction), character() )
+ expect_true(all(is.na(res$prediction)))
+ expect_is(res$prediction, "factor")
+ expect_equal(levels(res$prediction), character())
})
test_that("Deep copies are performed when needed (#1463)", {
- res <- data.frame(prob = c(F,T)) %>%
+ res <- data.frame(prob = c(F, T)) %>%
rowwise %>%
- mutate(model = list(x=prob) )
- expect_equal(unlist(res$model), c(FALSE,TRUE))
+ mutate(model = list(x = prob))
+ expect_equal(unlist(res$model), c(FALSE, TRUE))
- res <- data.frame(x=1:4, g=c(1,1,1,2)) %>%
+ res <- data.frame(x = 1:4, g = c(1, 1, 1, 2)) %>%
group_by(g) %>%
- mutate(model = list(y=x) )
+ mutate(model = list(y = x))
expect_equal(res$model[[1]], 1:3)
expect_equal(res$model[[4]], 4)
})
-test_that( "ntile falls back to R (#1750)", {
- res <- mutate( iris, a = ntile("Sepal.Length", 3))
- expect_equal( res$a, rep(1, 150))
+test_that("ntile falls back to R (#1750)", {
+ res <- mutate(iris, a = ntile("Sepal.Length", 3))
+ expect_equal(res$a, rep(1, 150))
+})
+
+test_that("mutate() names pronouns correctly (#2686)", {
+ expect_named(mutate(tibble(x = 1), .data$x), "x")
+ expect_named(mutate(tibble(x = 1), .data[["x"]]), "x")
+})
+
+test_that("mutate() supports unquoted values", {
+ df <- tibble(g = c(1, 1, 2, 2, 2), x = 1:5)
+ expect_identical(mutate(df, out = !! 1), mutate(df, out = 1))
+ expect_identical(mutate(df, out = !! 1:5), mutate(df, out = 1:5))
+ expect_identical(mutate(df, out = !! quote(1:5)), mutate(df, out = 1:5))
+ expect_error(mutate(df, out = !! 1:2), "must be length 5 (the number of rows)", fixed = TRUE)
+ expect_error(mutate(df, out = !! get_env()), "unsupported type")
+
+ gdf <- group_by(df, g)
+ expect_identical(mutate(gdf, out = !! 1), mutate(gdf, out = 1))
+ expect_identical(mutate(gdf, out = !! 1:5), group_by(mutate(df, out = 1:5), g))
+ expect_error(mutate(gdf, out = !! quote(1:5)), "must be length 2 (the group size)", fixed = TRUE)
+ expect_error(mutate(gdf, out = !! 1:2), "must be length 5 (the number of rows)", fixed = TRUE)
+ expect_error(mutate(gdf, out = !! get_env()), "unsupported type")
})
-test_that("mutate fails gracefully on raw columns (#1803)", {
+
+# Error messages ----------------------------------------------------------
+
+test_that("mutate fails gracefully on non-vector columns (#1803)", {
df <- data_frame(a = 1:3, b = as.raw(1:3))
- expect_error( mutate(df, a = 1), 'Unsupported type RAWSXP for column "b"' )
- expect_error( mutate(df, b = 1), 'Unsupported type RAWSXP for column "b"' )
- expect_error( mutate(df, c = 1), 'Unsupported type RAWSXP for column "b"' )
+ expect_error(
+ mutate(df, a = 1),
+ "Column `b` is of unsupported type raw vector",
+ fixed = TRUE
+ )
+ expect_error(
+ mutate(df, b = 1),
+ "Column `b` is of unsupported type raw vector",
+ fixed = TRUE
+ )
+ expect_error(
+ mutate(df, c = 1),
+ "Column `b` is of unsupported type raw vector",
+ fixed = TRUE
+ )
+})
+
+test_that("grouped mutate errors on incompatible column type (#1641)", {
+ expect_error(
+ tibble(x = 1) %>% mutate(y = mean),
+ "Column `y` is of unsupported type function",
+ fixed = TRUE
+ )
+ expect_error(
+ tibble(x = 1) %>% mutate(y = quote(a)),
+ "Column `y` is of unsupported type symbol",
+ fixed = TRUE
+ )
+})
+
+test_that("can reuse new variables", {
+ expect_equal(
+ data.frame(c = 1) %>% mutate(c, gc = mean(c)),
+ data.frame(c = 1, gc = 1)
+ )
+})
+
+test_that("can use character vectors in grouped mutate (#2971)", {
+ df <-
+ data_frame(x = 1:10000) %>%
+ group_by(x) %>%
+ mutate(y = as.character(runif(1L)),
+ z = as.character(runif(1L)))
+
+ expect_error(df %>% distinct(x, .keep_all = TRUE), NA)
+})
+
+test_that("mutate() to UTF-8 column names", {
+ df <- data_frame(a = 1) %>% mutate("\u5e78" := a)
+
+ expect_equal(colnames(df), c("a", "\u5e78"))
})
diff --git a/tests/testthat/test-count.r b/tests/testthat/test-n_distinct.R
similarity index 52%
rename from tests/testthat/test-count.r
rename to tests/testthat/test-n_distinct.R
index dd74e6c..c754a0c 100644
--- a/tests/testthat/test-count.r
+++ b/tests/testthat/test-n_distinct.R
@@ -1,25 +1,4 @@
-context("Count")
-
-test_that("can count variable called n", {
- df <- data.frame(n = c(1, 1, 2, 2, 2))
-
- out <- df %>% count(n)
- expect_equal(names(out), c("n", "nn"))
- expect_equal(out$nn, c(2, 3))
-
- out <- df %>% count(n, sort = TRUE)
- expect_equal(out$nn, c(3, 2))
-})
-
-test_that("grouped count includes group", {
- df <- data.frame(g = c(1, 2, 2, 2))
-
- res <- df %>% group_by(g) %>% count()
- expect_equal(names(res), c("g", "n"))
- expect_equal(res$n, c(1, 3))
-})
-
-# n_distinct --------------------------------------------------------------
+context("n_distinct")
test_that("count_distinct gives the correct results on iris", {
expect_equal(
@@ -46,6 +25,5 @@ test_that("count_distinct gives correct results for key types", {
})
test_that("n_distinct treats NA correctly in the REALSXP case (#384)", {
- expect_equal( n_distinct( c(1.0,NA,NA) ), 2 )
+ expect_equal(n_distinct(c(1.0, NA, NA)), 2)
})
-
diff --git a/tests/testthat/test-na-if.R b/tests/testthat/test-na-if.R
index d34a090..22f8710 100644
--- a/tests/testthat/test-na-if.R
+++ b/tests/testthat/test-na-if.R
@@ -1,7 +1,17 @@
context("na_if")
test_that("error for bad y length", {
- expect_error(na_if(1:3, 1:2), "must be length 1 or same length")
+ expect_error(
+ na_if(1:3, 1:2),
+ "`y` must be length 3 (same as `x`) or one, not 2",
+ fixed = TRUE
+ )
+
+ expect_error(
+ na_if(1, 1:2),
+ "`y` must be length 1 (same as `x`), not 2",
+ fixed = TRUE
+ )
})
test_that("scalar y replaces all matching x", {
@@ -9,7 +19,3 @@ test_that("scalar y replaces all matching x", {
expect_equal(na_if(x, 0), c(NA, 1, NA))
expect_equal(na_if(x, 1), c(0, NA, 0))
})
-
-test_that("is translated to NULL_IF", {
- expect_equal(translate_sql(na_if(x, 0L)), sql('NULL_IF("x", 0)'))
-})
diff --git a/tests/testthat/test-nth-value.R b/tests/testthat/test-nth-value.R
index ce4c3ab..fe690b1 100644
--- a/tests/testthat/test-nth-value.R
+++ b/tests/testthat/test-nth-value.R
@@ -19,24 +19,22 @@ test_that("indexing past ends returns default value", {
expect_equal(nth(1:4, 5), NA_integer_)
expect_equal(nth(1:4, -5), NA_integer_)
expect_equal(nth(1:4, -10), NA_integer_)
-
-
})
-test_that("first and last use default value for 0 length inputs", {
+test_that("first uses default value for 0 length vectors", {
+ expect_equal(first(logical()), NA)
+ expect_equal(first(integer()), NA_integer_)
expect_equal(first(numeric()), NA_real_)
- expect_equal(last(numeric()), NA_real_)
+ expect_equal(first(character()), NA_character_)
+ expect_equal(first(list()), NULL)
})
-test_that("default value returns appropriate missing for basic vectors", {
- expect_equal(default_missing(TRUE), NA)
- expect_equal(default_missing(1), NA_real_)
- expect_equal(default_missing(1L), NA_integer_)
- expect_equal(default_missing("a"), NA_character_)
- expect_equal(default_missing(list()), NULL)
-})
+test_that("first uses default value for 0 length augmented vectors", {
+ fc <- factor("a")[0]
+ dt <- Sys.Date()
+ tm <- Sys.time()
-test_that("default value errors for complicated structures", {
- expect_error(default_missing(factor("a")), "generate default for object")
- expect_error(default_missing(mtcars), "generate default for object")
+ expect_equal(first(fc[0]), fc[NA])
+ expect_equal(first(dt[0]), dt[NA])
+ expect_equal(first(tm[0]), tm[NA])
})
diff --git a/tests/testthat/test-output.R b/tests/testthat/test-output.R
deleted file mode 100644
index 9362592..0000000
--- a/tests/testthat/test-output.R
+++ /dev/null
@@ -1,34 +0,0 @@
-context("output")
-
-test_that("ungrouped output", {
- if (packageVersion("tibble") < "1.0-10")
- skip("need tibble 1.0-10 or later for this test")
-
- mtcars_mem <- src_memdb() %>% copy_to(mtcars, name = random_table_name())
- iris_mem <- src_memdb() %>% copy_to(iris, name = random_table_name())
-
- with_mock(
- `dplyr::sqlite_version` = function() "x.y.z",
- {
- expect_output_file_rel(
- print(mtcars_mem, n = 8L, width = 30L),
- "mtcars-8-30.txt")
-
- expect_output_file_rel(
- print(iris_mem, n = 5L, width = 30L),
- "iris-5-30.txt")
-
- expect_output_file_rel(
- print(iris_mem, n = 3L, width = 5L),
- "iris-3-5.txt")
-
- expect_output_file_rel(
- print(iris_mem, n = NULL, width = 70L),
- "iris--70.txt")
-
- expect_output_file_rel(
- print(iris_mem %>% head(), n = 30L, width = 80L),
- "iris-head-30-80.txt")
- }
- )
-})
diff --git a/tests/testthat/test-overscope.R b/tests/testthat/test-overscope.R
new file mode 100644
index 0000000..9a596f5
--- /dev/null
+++ b/tests/testthat/test-overscope.R
@@ -0,0 +1,13 @@
+context("overscope")
+
+test_that(".data has strict matching semantics (#2591)", {
+ expect_error(
+ data_frame(a = 1) %>% mutate(c = .data$b),
+ "Column `b`: not found in data"
+ )
+
+ expect_error(
+ data_frame(a = 1:3) %>% group_by(a) %>% mutate(c = .data$b),
+ "Column `b`: not found in data"
+ )
+})
diff --git a/tests/testthat/test-pull.R b/tests/testthat/test-pull.R
new file mode 100644
index 0000000..6094ff8
--- /dev/null
+++ b/tests/testthat/test-pull.R
@@ -0,0 +1,54 @@
+context("pull")
+
+test_that("default extracts last var from data frame", {
+ df <- data_frame(x = 1:10, y = 1:10)
+ expect_equal(pull(df), 1:10)
+})
+
+test_that("can extract by name, or positive/negative position", {
+ x <- 1:10
+ df <- data_frame(x = x, y = runif(10))
+
+ expect_equal(pull(df, x), x)
+ expect_equal(pull(df, 1L), x)
+ expect_equal(pull(df, 1), x)
+ expect_equal(pull(df, -2), x)
+ expect_equal(pull(df, -2L), x)
+})
+
+# select_var --------------------------------------------------------------
+
+test_that("errors for bad inputs", {
+ expect_error(
+ select_var(letters, letters),
+ "`var` must evaluate to a single number",
+ fixed = TRUE
+ )
+
+ expect_error(
+ select_var(letters, aa),
+ "object 'aa' not found",
+ fixed = TRUE
+ )
+
+ expect_error(
+ select_var(letters, 0),
+ "`var` must be a value between -26 and 26 (excluding zero), not 0",
+ fixed = TRUE
+ )
+ expect_error(
+ select_var(letters, 100),
+ "`var` must be a value between -26 and 26 (excluding zero), not 100",
+ fixed = TRUE
+ )
+ expect_error(
+ select_var(letters, -Inf),
+ "`var` must be a value between -26 and 26 (excluding zero), not NA",
+ fixed = TRUE
+ )
+ expect_error(
+ select_var(letters, NA_integer_),
+ "`var` must be a value between -26 and 26 (excluding zero), not NA",
+ fixed = TRUE
+ )
+})
diff --git a/tests/testthat/test-rank.R b/tests/testthat/test-rank.R
new file mode 100644
index 0000000..6d05847
--- /dev/null
+++ b/tests/testthat/test-rank.R
@@ -0,0 +1,26 @@
+context("rank")
+
+ntile_h <- function(x, n) {
+ tibble(x = x) %>%
+ mutate(y = ntile(x, n)) %>%
+ pull(y)
+}
+
+test_that("ntile ignores number of NAs", {
+ x <- c(1:3, NA, NA, NA)
+
+ expect_equal(ntile(x, 3), x)
+ expect_equal(ntile_h(x, 3), x)
+
+ x1 <- c(1L, 1L, 1L, NA, NA, NA)
+ expect_equal(ntile(x, 1), x1)
+ expect_equal(ntile_h(x, 1), x1)
+})
+
+test_that("ntile always returns an integer", {
+ expect_equal(ntile(numeric(), 3), integer())
+ expect_equal(ntile_h(numeric(), 3), integer())
+
+ expect_equal(ntile(NA, 3), NA_integer_)
+ expect_equal(ntile_h(NA, 3), NA_integer_)
+})
diff --git a/tests/testthat/test-rbind.R b/tests/testthat/test-rbind.R
new file mode 100644
index 0000000..6c62994
--- /dev/null
+++ b/tests/testthat/test-rbind.R
@@ -0,0 +1,309 @@
+context("rbind")
+
+rbind_list_warn <- function(...) {
+ expect_warning(ret <- rbind_list(...), "bind_rows")
+ ret
+}
+
+rbind_all_warn <- function(...) {
+ expect_warning(ret <- rbind_list(...), "bind_rows")
+ ret
+}
+
+df_var <- data.frame(
+ l = c(T, F, F),
+ i = c(1, 1, 2),
+ d = Sys.Date() + c(1, 1, 2),
+ f = factor(letters[c(1, 1, 2)]),
+ n = c(1, 1, 2) + 0.5,
+ t = Sys.time() + c(1, 1, 2),
+ c = letters[c(1, 1, 2)],
+ stringsAsFactors = FALSE
+)
+
+test_that("rbind_list works on key types", {
+ exp <- tbl_df( rbind( df_var, df_var, df_var ) )
+ expect_equal(
+ rbind_list_warn(df_var, df_var, df_var),
+ exp
+ )
+})
+
+test_that("rbind_list reorders columns", {
+ columns <- seq_len(ncol(df_var))
+ exp <- tbl_df( rbind( df_var, df_var, df_var ) )
+ expect_equal(
+ rbind_list_warn(
+ df_var,
+ df_var[, sample(columns)],
+ df_var[, sample(columns)]
+ ),
+ exp
+ )
+})
+
+test_that("rbind_list promotes integer to numeric", {
+ df <- data.frame( a = 1:5, b = 1:5 )
+ df2 <- df
+ df2$a <- as.numeric(df$a)
+
+ res <- rbind_list_warn(df, df2)
+ expect_equal( typeof(res$a), "double" )
+ expect_equal( typeof(res$b), "integer" )
+})
+
+test_that("rbind_list promotes factor to character", {
+ df <- data.frame( a = letters[1:5], b = 1:5, stringsAsFactors=TRUE )
+ df2 <- df
+ df2$a <- as.character(df$a)
+
+ res <- rbind_list_warn(df, df2)
+ expect_equal( typeof(res$a), "character" )
+})
+
+test_that("rbind_list doesn't promote factor to numeric", {
+ df1 <- data.frame( a = 1:5, b = 1:5 )
+ df2 <- data.frame( a = 1:5, b = factor(letters[1:5]) )
+
+ expect_error(rbind_list_warn(df1, df2))
+})
+
+test_that("rbind_list doesn't coerce integer to factor", {
+ df1 <- data.frame( a = 1:10, b = 1:10 )
+ df2 <- data.frame( a = 1:5, b = factor(letters[1:5]) )
+
+ expect_error(rbind_list_warn(df1, df2))
+})
+
+test_that( "rbind_list coerces factor to character when levels don't match", {
+ df1 <- data.frame( a = 1:3, b = factor(c("a", "b", "c")))
+ df2 <- data.frame( a = 1:3, b = factor(c("a", "b", "c"),
+ levels = c("b", "c", "a", "d")))
+
+ expect_warning(res <- rbind_list( df1, df2 ),
+ "Unequal factor levels: coercing to character")
+ expect_equal( res$b, c("a","b","c", "a","b","c" ) )
+})
+
+test_that( "rbind handles NULL",{
+ x <- cbind(a=1:10,b=1:10)
+ y <- data.frame(x)
+ res <- rbind_all_warn(list(y, y, NULL, y))
+ expect_equal(nrow(res), 30L)
+})
+
+test_that( "rbind handles NA in factors #279", {
+ xx <- as.data.frame(list(a=as.numeric(NA), b="c", c="d"))
+ zz <- as.data.frame(list(a=1, b=as.character(NA), c="b"))
+ expect_warning( res <- rbind_list( xx, zz ) )
+
+ expect_equal(res$a, c(NA,1.0))
+ expect_equal(res$b, c("c", NA))
+ expect_equal(res$c, c("d","b"))
+
+})
+
+test_that( "rbind_all only accepts data frames #288",{
+ ll <- list(c(1,2,3,4, 5), c(6, 7, 8, 9, 10))
+ expect_error(rbind_all_warn(ll))
+})
+
+test_that( "rbind propagates timezone for POSIXct #298", {
+ dates1 <- data.frame(ID=c("a", "b", "c"),
+ dates=structure(c(-247320000, -246196800, -245073600),
+ tzone = "GMT",
+ class = c("POSIXct", "POSIXt")),
+ stringsAsFactors=FALSE)
+
+ dates2 <- data.frame(ID=c("d", "e", "f"),
+ dates=structure(c(-243864000, -242654400, -241444800),
+ tzone = "GMT",
+ class = c("POSIXct", "POSIXt")),
+ stringsAsFactors=FALSE)
+
+ alldates <- rbind_list_warn(dates1, dates2)
+ expect_equal( attr( alldates$dates, "tzone" ), "GMT" )
+})
+
+test_that( "Collecter_Impl<REALSXP> can collect INTSXP. #321", {
+ res <- rbind_list_warn(data.frame(x = 0.5), data.frame(x = 1:3))
+ expect_equal( res$x, c(0.5, 1:3) )
+})
+
+test_that( "Collecter_Impl<INTSXP> can collect LGLSXP. #321", {
+ res <- rbind_list_warn(data.frame(x = 1:3), data.frame(x = NA))
+ expect_equal( res$x, c(1:3, NA) )
+})
+
+test_that("rbind_all handles list columns (#463)", {
+ dfl <- data.frame(x = I(list(1:2, 1:3, 1:4)))
+ res <- rbind_all_warn(list(dfl, dfl))
+ expect_equal(rep(dfl$x,2L), res$x)
+})
+
+test_that("rbind_all creates tbl_df object", {
+ res <- rbind_list_warn(tbl_df(mtcars))
+ expect_is( res, "tbl_df" )
+})
+
+test_that("string vectors are filled with NA not blanks before collection (#595)", {
+ one <- mtcars[1:10, -10]
+ two <- mtcars[11:32, ]
+ two$char_col <- letters[1:22]
+
+ res <- rbind_list_warn(one, two)
+ expect_true( all(is.na(res$char_col[1:10])) )
+})
+
+test_that("rbind handles data frames with no rows (#597)",{
+ empty <- data.frame(result = numeric())
+ expect_equal(rbind_list_warn(empty), tbl_df(empty))
+ expect_equal(rbind_list_warn(empty, empty), tbl_df(empty))
+ expect_equal(rbind_list_warn(empty, empty, empty), tbl_df(empty))
+})
+
+test_that("rbind handles all NA columns (#493)", {
+ mydata <- list(
+ data.frame(x=c("foo", "bar")),
+ data.frame(x=NA)
+ )
+ res <- rbind_all_warn(mydata)
+ expect_true( is.na(res$x[3]) )
+ expect_is( res$x, "factor" )
+
+ mydata <- list(
+ data.frame(x=NA),
+ data.frame(x=c("foo", "bar"))
+ )
+ res <- rbind_all_warn(mydata)
+ expect_true( is.na(res$x[1]) )
+ expect_is( res$x, "factor" )
+
+})
+
+test_that( "bind_rows handles complex. #933", {
+ df1 <- data.frame(r = c(1+1i, 2-1i))
+ df2 <- data.frame(r = c(1-1i, 2+1i))
+ df3 <- bind_rows(df1,df2)
+ expect_equal( nrow(df3), 4L)
+ expect_equal( df3$r, c(df1$r, df2$r) )
+})
+
+test_that("bind_rows is careful about column names encoding #1265", {
+ one <- data.frame(foo=1:3, bar=1:3); names(one) <- c("f\u00fc", "bar")
+ two <- data.frame(foo=1:3, bar=1:3); names(two) <- c("f\u00fc", "bar")
+ Encoding(names(one)[1]) <- "UTF-8"
+ expect_equal( names(one), names(two))
+ res <- bind_rows(one,two)
+ expect_equal(ncol(res), 2L)
+})
+
+test_that("bind_rows handles POSIXct (#1125)", {
+ df1 <- data.frame(date = as.POSIXct(NA))
+ df2 <- data.frame(date = as.POSIXct("2015-05-05"))
+ res <- bind_rows(df1,df2)
+ expect_equal(nrow(res),2L)
+ expect_true(is.na(res$date[1]))
+})
+
+test_that("bind_rows respects ordered factors (#1112)", {
+ l <- c("a", "b", "c", "d")
+ id <- factor(c("a", "c", "d"), levels = l, ordered = TRUE)
+ df <- data.frame(id = rep(id, 2), val = rnorm(6))
+ res <- bind_rows(df, df)
+ expect_is( res$id, "ordered")
+ expect_equal( levels(df$id), levels(res$id) )
+
+ res <- group_by(df, id) %>% do(na.omit(.))
+ expect_is( res$id, "ordered")
+ expect_equal( levels(df$id), levels(res$id) )
+})
+
+test_that("bind_rows can handle lists (#1104)", {
+ my_list <- list(list(x = 1, y = 'a'), list(x = 2, y = 'b'))
+ res <- bind_rows(my_list)
+ expect_equal(nrow(res), 2L)
+ expect_is(res$x, "numeric")
+ expect_is(res$y, "character")
+
+ res <- bind_rows(list(x = 1, y = 'a'), list(x = 2, y = 'b'))
+ expect_equal(nrow(res), 2L)
+ expect_is(res$x, "numeric")
+ expect_is(res$y, "character")
+})
+
+test_that("rbind_list keeps ordered factors (#948)", {
+ y <- rbind_list_warn(
+ data.frame(x = factor(c(1,2,3), ordered = TRUE)),
+ data.frame(x = factor(c(1,2,3), ordered = TRUE))
+ )
+ expect_is( y$x, "ordered" )
+ expect_equal( levels(y$x), as.character(1:3) )
+})
+
+test_that("bind handles POSIXct of different tz ", {
+ date1 <- structure(-1735660800, tzone = "America/Chicago", class = c("POSIXct", "POSIXt"))
+ date2 <- structure(-1735660800, tzone = "UTC", class = c("POSIXct", "POSIXt"))
+ date3 <- structure(-1735660800, class = c("POSIXct", "POSIXt"))
+
+ df1 <- data.frame( date = date1 )
+ df2 <- data.frame( date = date2 )
+ df3 <- data.frame( date = date3 )
+
+ res <- bind_rows(df1, df2)
+ expect_equal( attr(res$date, "tzone"), "UTC" )
+
+ res <- bind_rows(df1, df3)
+ expect_equal( attr(res$date, "tzone"), "America/Chicago" )
+
+ res <- bind_rows(df2, df3)
+ expect_equal( attr(res$date, "tzone"), "UTC" )
+
+ res <- bind_rows(df3, df3)
+ expect_equal( attr(res$date, "tzone"), NULL )
+
+ res <- bind_rows(df1, df2, df3)
+ expect_equal( attr(res$date, "tzone"), "UTC" )
+
+})
+
+test_that("bind_rows() creates a column of identifiers (#1337)", {
+ data1 <- mtcars[c(2, 3), ]
+ data2 <- mtcars[1, ]
+
+ out <- bind_rows(data1, data2, .id = "col")
+ out_list <- bind_rows(list(data1, data2), .id = "col")
+ expect_equal(names(out)[1], "col")
+ expect_equal(out$col, c("1", "1", "2"))
+ expect_equal(out_list$col, c("1", "1", "2"))
+
+ out_labelled <- bind_rows(one = data1, two = data2, .id = "col")
+ out_list_labelled <- bind_rows(list(one = data1, two = data2), .id = "col")
+ expect_equal(out_labelled$col, c("one", "one", "two"))
+ expect_equal(out_list_labelled$col, c("one", "one", "two"))
+})
+
+test_that("empty data frame are handled (#1346)", {
+ res <- data_frame() %>% bind_rows(data_frame(x = "a"))
+ expect_equal( nrow(res), 1L)
+})
+
+test_that("bind_rows handles POSIXct stored as integer (#1402)", {
+ now <- Sys.time()
+
+ df1 <- data.frame(time = now)
+ expect_equal( class(bind_rows(df1)$time), c("POSIXct", "POSIXt") )
+
+ df2 <- data.frame(time = seq(now, length.out = 1, by = 1))
+ expect_equal( class(bind_rows(df2)$time), c("POSIXct", "POSIXt") )
+
+ res <- bind_rows( df1, df2 )
+ expect_equal( class(res$time), c("POSIXct", "POSIXt") )
+ expect_true( all(res$time == c(df1$time, df2$time) ) )
+})
+
+test_that("bind_rows warns on binding factor and character (#1485)", {
+ df1 <- head(iris, 1)
+ df2 <- tail(iris, 1) %>% mutate(Species = as.character(Species))
+ expect_warning( bind_rows(df1, df2), "binding factor and character vector, coercing into character vector" )
+})
diff --git a/tests/testthat/test-recode.R b/tests/testthat/test-recode.R
index ce7fcd4..73fcb38 100644
--- a/tests/testthat/test-recode.R
+++ b/tests/testthat/test-recode.R
@@ -1,9 +1,32 @@
context("recode")
test_that("error if no arguments", {
- expect_error(recode(1:5), "No replacements provided")
- expect_error(recode("a"), "No replacements provided")
- expect_error(recode(factor("a")), "No replacements provided")
+ expect_error(
+ recode(1:5), "No replacements provided")
+ expect_error(
+ recode("a"), "No replacements provided")
+ expect_error(
+ recode(factor("a")), "No replacements provided")
+})
+
+test_that("error if unnamed", {
+ expect_error(
+ recode("a", b = 5, "c"),
+ "Argument 3 must be named, not unnamed"
+ )
+ expect_error(
+ recode(factor("a"), b = 5, "c"),
+ "Argument 3 must be named, not unnamed",
+ fixed = TRUE
+ )
+})
+
+test_that("error if missing given for factors", {
+ expect_error(
+ recode(factor("a"), a = 5, .missing = 10),
+ "`.missing` is not supported for factors",
+ fixed = TRUE
+ )
})
test_that("positional substitution works", {
@@ -32,11 +55,13 @@ test_that("named substitution works", {
test_that("missing values replaced by missing argument", {
expect_equal(recode(c(1, NA), "a"), c("a", NA))
expect_equal(recode(c(1, NA), "a", .missing = "b"), c("a", "b"))
+ expect_equal(recode(c(letters[1:3], NA), .missing = "A"), c("a", "b", "c", "A"))
})
test_that("unmatched value replaced by default argument", {
expect_warning(expect_equal(recode(c(1, 2), "a"), c("a", NA)))
expect_equal(recode(c(1, 2), "a", .default = "b"), c("a", "b"))
+ expect_equal(recode(letters[1:3], .default = "A"), c("A", "A", "A"))
})
test_that("missing and default place nicely together", {
@@ -71,34 +96,86 @@ test_that(".default is not aliased to .x when missing and not compatible", {
expect_warning(expect_equal(recode(n, `1` = "a"), c("a", NA, NA)))
})
+test_that("conversion of unreplaced values to NA gives warning", {
+ expect_warning(recode(1:3, `1` = "a"), "treated as NA")
+ expect_warning(recode_factor(letters[1:3], b = 1, c = 2))
+})
+
+test_that(".dot argument works correctly (PR #2110)", {
+ x1 <- letters[1:3]
+ x2 <- 1:3
+
+ expect_equal(
+ recode(x1, a = "apple", b = "banana", .default = NA_character_),
+ recode(x1, .default = NA_character_, !!! list(a = "apple", b = "banana"))
+ )
+ expect_equal(
+ recode(x1, a = "apple", b = "banana", .default = NA_character_),
+ recode(x1, a = "apple", .default = NA_character_, !!! list(b = "banana"))
+ )
+ expect_equal(
+ recode(x2, "1" = 4, "2" = 5, .default = NA_real_),
+ recode(x2, .default = NA_real_, !!! list("1" = 4, "2" = 5))
+ )
+ expect_equal(
+ recode(x2, "1" = 4, "2" = 5, .default = NA_real_),
+ recode(x2, "1" = 4, .default = NA_real_, !!! list("2" = 5))
+ )
+})
+
+
+# factor ------------------------------------------------------------------
+
test_that("default .default works with factors", {
expect_equal(recode(factor(letters[1:3]), a = "A"), factor(c("A", "b", "c")))
})
+test_that("can recode factor to double", {
+ f <- factor(letters[1:3])
+
+ expect_equal(recode(f, a = 1, b = 2, c = 3), c(1, 2, 3))
+ expect_equal(recode(f, a = 1, b = 2), c(1, 2, NA))
+ expect_equal(recode(f, a = 1, b = 2, .default = 99), c(1, 2, 99))
+})
+
test_that("recode_factor() handles .missing and .default levels", {
x <- c(1:3, NA)
- expect_warning(expect_equal(recode_factor(x, `1` = "z", `2` = "y"), factor(c("z", "y", NA, NA), levels = c("z", "y"))))
- expect_equal(recode_factor(x, `1` = "z", `2` = "y", .default = "D"), factor(c("z", "y", "D", NA), levels = c("z", "y", "D")))
- expect_equal(recode_factor(x, `1` = "z", `2` = "y", .default = "D", .missing = "M"), factor(c("z", "y", "D", "M"), c("z", "y", "D", "M")))
+ expect_warning(
+ expect_equal(
+ recode_factor(x, `1` = "z", `2` = "y"),
+ factor(c("z", "y", NA, NA), levels = c("z", "y"))
+ )
+ )
+ expect_equal(
+ recode_factor(x, `1` = "z", `2` = "y", .default = "D"),
+ factor(c("z", "y", "D", NA), levels = c("z", "y", "D"))
+ )
+ expect_equal(
+ recode_factor(x, `1` = "z", `2` = "y", .default = "D", .missing = "M"),
+ factor(c("z", "y", "D", "M"), c("z", "y", "D", "M"))
+ )
})
test_that("recode_factor() handles vector .default", {
- character_default <- recode_factor(factor(letters[1:3]), b = "z", c = "y", .default = letters[1:3])
- implicit_factor_default <- recode_factor(factor(letters[1:3]), b = "z", c = "y")
- implicit_character_default <- recode_factor(letters[1:3], b = "z", c = "y")
-
expected <- factor(c("a", "z", "y"), levels = c("z", "y", "a"))
- expect_equal(character_default, expected)
- expect_equal(implicit_factor_default, expected)
- expect_equal(implicit_character_default, expected)
+ x1 <- letters[1:3]
+ x2 <- factor(x1)
+
+ expect_equal(recode_factor(x1, b = "z", c = "y"), expected)
+ expect_equal(recode_factor(x2, b = "z", c = "y"), expected)
+
+ expect_equal(recode_factor(x1, b = "z", c = "y", .default = x1), expected)
+ expect_equal(recode_factor(x2, b = "z", c = "y", .default = x1), expected)
})
test_that("can recode factor with redundant levels", {
- expect_equal(recode(factor(letters[1:4]), d = "c", b = "a"), factor(c("a", "a", "c", "c"), levels = c("a", "c")))
- expect_equal(recode_factor(letters[1:4], d = "c", b = "a"), factor(c("a", "a", "c", "c"), levels = c("c", "a")))
+ expect_equal(
+ recode(factor(letters[1:4]), d = "c", b = "a"),
+ factor(c("a", "a", "c", "c"), levels = c("a", "c"))
+ )
+ expect_equal(
+ recode_factor(letters[1:4], d = "c", b = "a"),
+ factor(c("a", "a", "c", "c"), levels = c("c", "a"))
+ )
})
-test_that("conversion of unreplaced values to NA gives warning", {
- expect_warning(recode(1:3, `1` = "a"), "treated as NA")
- expect_warning(recode_factor(letters[1:3], b = 1, c = 2))
-})
diff --git a/tests/testthat/test-sample.R b/tests/testthat/test-sample.R
index 611079a..944224a 100644
--- a/tests/testthat/test-sample.R
+++ b/tests/testthat/test-sample.R
@@ -10,7 +10,7 @@ test_that("sample preserves class", {
expect_is(sample_frac(tbl_df(mtcars), 1), "tbl_df")
})
-# Ungrouped -------------------------------------------------------------------
+# Ungrouped --------------------------------------------------------------------
df <- data.frame(
x = 1:2,
@@ -18,15 +18,54 @@ df <- data.frame(
)
test_that("sample respects weight", {
- expect_error(sample_n(df, 2, weight = y), "too few positive probabilities")
+ # error message from base R
+ expect_error(sample_n(df, 2, weight = y))
expect_equal(sample_n(df, 1, weight = y)$x, 2)
- expect_error(sample_frac(df, 1, weight = y), "too few positive probabilities")
+ expect_error(
+ sample_frac(df, 2),
+ "`size` of sampled fraction must be less or equal to one, set `replace` = TRUE to use sampling with replacement",
+ fixed = TRUE
+ )
+ expect_error(
+ sample_frac(df %>% group_by(y), 2),
+ "`size` of sampled fraction must be less or equal to one, set `replace` = TRUE to use sampling with replacement",
+ fixed = TRUE
+ )
+ # error message from base R
+ expect_error(sample_frac(df, 1, weight = y))
expect_equal(sample_frac(df, 0.5, weight = y)$x, 2)
})
+test_that("sample_* error message", {
+ expect_error(
+ check_weight(letters[1:2], 2),
+ "`weight` must be a numeric, not character",
+ fixed = TRUE
+ )
+ expect_error(
+ check_weight(-1:-2, 2),
+ "`weight` must be a vector with all values nonnegative, not -1",
+ fixed = TRUE
+ )
+ expect_error(
+ check_weight(letters, 2),
+ "`weight` must be a numeric, not character"
+ )
+})
+
test_that("sample gives informative error for unknown type", {
- expect_error(sample_n(list()), "Don't know how to sample")
+ expect_error(
+ sample_n(list()),
+ "`tbl` must be a data frame, not list",
+ fixed = TRUE
+ )
+
+ expect_error(
+ sample_frac(list()),
+ "`tbl` must be a data frame, not list",
+ fixed = TRUE
+ )
})
# Grouped ----------------------------------------------------------------------
@@ -34,14 +73,18 @@ test_that("sample gives informative error for unknown type", {
test_that("sampling grouped tbl samples each group", {
sampled <- mtcars %>% group_by(cyl) %>% sample_n(2)
expect_is(sampled, "grouped_df")
- expect_equal(groups(sampled), list(quote(cyl)))
+ expect_groups(sampled, "cyl")
expect_equal(nrow(sampled), 6)
expect_equal(sampled$cyl, rep(c(4, 6, 8), each = 2))
})
test_that("can't sample more values than obs (without replacement)", {
by_cyl <- mtcars %>% group_by(cyl)
- expect_error(sample_n(by_cyl, 10), "Do you want replace = TRUE")
+ expect_error(
+ sample_n(by_cyl, 10),
+ "`size` must be less or equal than 7 (size of data), set `replace` = TRUE to use sampling with replacement",
+ fixed = TRUE
+ )
})
df2 <- data.frame(
@@ -54,9 +97,11 @@ df2 <- data.frame(
test_that("grouped sample respects weight", {
grp <- df2 %>% group_by(g)
- expect_error(sample_n(grp, 2, weight = y), "too few positive probabilities")
+ # error message from base R
+ expect_error(sample_n(grp, 2, weight = y))
expect_equal(sample_n(grp, 1, weight = y)$x, c(2, 2))
- expect_error(sample_frac(grp, 1, weight = y), "too few positive probabilities")
+ # error message from base R
+ expect_error(sample_frac(grp, 1, weight = y))
expect_equal(sample_frac(grp, 0.5, weight = y)$x, c(2, 2))
})
diff --git a/tests/testthat/test-select-helpers.R b/tests/testthat/test-select-helpers.R
index 682f25e..215377d 100644
--- a/tests/testthat/test-select-helpers.R
+++ b/tests/testthat/test-select-helpers.R
@@ -1,20 +1,28 @@
context("select-helpers")
+test_that("no set variables throws error", {
+ expect_error(
+ starts_with("z"),
+ "Variable context not set",
+ fixed = TRUE
+ )
+})
+
test_that("failed match removes all columns", {
- set_current_vars(c("x", "y"))
- on.exit(reset_current_vars())
+ old <- set_current_vars(c("x", "y"))
+ on.exit(set_current_vars(old))
- expect_equal(starts_with("z"), -(1:2))
- expect_equal(ends_with("z"), -(1:2))
- expect_equal(contains("z"), -(1:2))
- expect_equal(matches("z"), -(1:2))
- expect_equal(num_range("z", 1:3), -(1:2))
+ expect_equal(starts_with("z"), integer(0))
+ expect_equal(ends_with("z"), integer(0))
+ expect_equal(contains("z"), integer(0))
+ expect_equal(matches("z"), integer(0))
+ expect_equal(num_range("z", 1:3), integer(0))
})
test_that("matches return integer positions", {
- set_current_vars(c("abc", "acd", "bbc", "bbd", "eee"))
- on.exit(reset_current_vars())
+ old <- set_current_vars(c("abc", "acd", "bbc", "bbd", "eee"))
+ on.exit(set_current_vars(old))
expect_equal(starts_with("a"), c(1L, 2L))
expect_equal(ends_with("d"), c(2L, 4L))
@@ -23,12 +31,36 @@ test_that("matches return integer positions", {
})
test_that("throws with empty pattern is provided", {
+ # error messages from rlang
expect_error(starts_with(""))
expect_error(ends_with(""))
expect_error(contains(""))
expect_error(matches(""))
})
+test_that("can use a variable", {
+ vars <- "x"
+ names(vars) <- vars
+
+ expect_equal(select_vars(vars, starts_with(vars)), c(x = "x"))
+ expect_equal(select_vars(vars, ends_with(vars)), c(x = "x"))
+ expect_equal(select_vars(vars, contains(vars)), c(x = "x"))
+ expect_equal(select_vars(vars, matches(vars)), c(x = "x"))
+})
+
+test_that("can use a variable even if it exists in the data (#2266)", {
+ vars <- c("x", "y")
+ names(vars) <- vars
+
+ y <- "x"
+ expected_result <- c(x = "x")
+
+ expect_equal(select_vars(vars, starts_with(y)), expected_result)
+ expect_equal(select_vars(vars, ends_with(y)), expected_result)
+ expect_equal(select_vars(vars, contains(y)), expected_result)
+ expect_equal(select_vars(vars, matches(y)), expected_result)
+})
+
test_that("num_range selects numeric ranges", {
vars <- c("x1", "x2", "x01", "x02", "x10", "x11")
names(vars) <- vars
@@ -39,18 +71,30 @@ test_that("num_range selects numeric ranges", {
expect_equal(select_vars(vars, num_range("x", 10:11, width = 2)), vars[5:6])
})
+test_that("position must resolve to numeric variables throws error", {
+ expect_error(
+ select_vars(letters, !! list()),
+ 'must resolve to integer column positions',
+ fixed = TRUE
+ )
+})
+
# one_of ------------------------------------------------------------------
test_that("one_of gives useful errors", {
- expect_error(one_of(1L, vars = c("x", "y")), "must be a character vector")
+ expect_error(
+ one_of(1L, vars = c("x", "y")),
+ "All arguments must be character vectors, not integer",
+ fixed = TRUE
+ )
})
test_that("one_of tolerates but warns for unknown variables", {
vars <- c("x", "y")
expect_warning(res <- one_of("z", vars = vars), "Unknown variables: `z`")
- expect_equal(res, -(1:2))
+ expect_equal(res, integer(0))
expect_warning(res <- one_of(c("x", "z"), vars = vars), "Unknown variables: `z`")
expect_equal(res, 1L)
@@ -59,3 +103,174 @@ test_that("one_of tolerates but warns for unknown variables", {
test_that("one_of converts names to positions", {
expect_equal(one_of("a", "z", vars = letters), c(1L, 26L))
})
+
+test_that("one_of works with variables", {
+ vars <- c("x", "y")
+ expected_result <- c(x = "x")
+ var <- "x"
+ expect_equal(select_vars(vars, one_of(var)), expected_result)
+ # error messages from rlang
+ expect_error(select_vars(vars, one_of(`_x`)), "not found")
+ expect_error(select_vars(vars, one_of(`_y`)), "not found")
+})
+
+test_that("one_of works when passed variable name matches the column name (#2266)", {
+ vars <- c("x", "y")
+ expected_result <- c(x = "x")
+ x <- "x"
+ y <- "x"
+ expect_equal(select_vars(vars, one_of(!! x)), expected_result)
+ expect_equal(select_vars(vars, one_of(!! y)), expected_result)
+ expect_equal(select_vars(vars, one_of(y)), expected_result)
+})
+
+# first-selector ----------------------------------------------------------
+
+test_that("initial (single) selector defaults correctly (issue #2275)", {
+ cn <- setNames(nm = c("x", "y", "z"))
+
+ ### Single Column Selected
+
+ # single columns (present), explicit
+ expect_equal(select_vars(cn, x), cn["x"])
+ expect_equal(select_vars(cn, -x), cn[c("y", "z")])
+
+ # single columns (present), matched
+ expect_equal(select_vars(cn, contains("x")), cn["x"])
+ expect_equal(select_vars(cn, -contains("x")), cn[c("y", "z")])
+
+ # single columns (not present), explicit
+ expect_error(select_vars(cn, foo), "not found")
+ expect_error(select_vars(cn, -foo), "not found")
+
+ # single columns (not present), matched
+ expect_equal(select_vars(cn, contains("foo")), cn[integer()])
+ expect_equal(select_vars(cn, -contains("foo")), cn)
+})
+
+test_that("initial (of multiple) selectors default correctly (issue #2275)", {
+ cn <- setNames(nm = c("x", "y", "z"))
+
+ ### Multiple Columns Selected
+
+ # explicit(present) + matched(present)
+ expect_equal(select_vars(cn, x, contains("y")), cn[c("x", "y")])
+ expect_equal(select_vars(cn, x, -contains("y")), cn["x"])
+ expect_equal(select_vars(cn, -x, contains("y")), cn[c("y", "z")])
+ expect_equal(select_vars(cn, -x, -contains("y")), cn["z"])
+
+ # explicit(present) + matched(not present)
+ expect_equal(select_vars(cn, x, contains("foo")), cn["x"])
+ expect_equal(select_vars(cn, x, -contains("foo")), cn["x"])
+ expect_equal(select_vars(cn, -x, contains("foo")), cn[c("y", "z")])
+ expect_equal(select_vars(cn, -x, -contains("foo")), cn[c("y", "z")])
+
+ # matched(present) + explicit(present)
+ expect_equal(select_vars(cn, contains("x"), y), cn[c("x", "y")])
+ expect_equal(select_vars(cn, contains("x"), -y), cn["x"])
+ expect_equal(select_vars(cn, -contains("x"), y), cn[c("y", "z")])
+ expect_equal(select_vars(cn, -contains("x"), -y), cn["z"])
+
+ # matched(not present) + explicit(not present)
+ expect_error(select_vars(cn, contains("foo"), bar), "object 'bar' not found")
+ expect_error(select_vars(cn, contains("foo"), -bar), "object 'bar' not found")
+ expect_error(select_vars(cn, -contains("foo"), bar), "object 'bar' not found")
+ expect_error(select_vars(cn, -contains("foo"), -bar), "object 'bar' not found")
+
+ # matched(present) + matched(present)
+ expect_equal(select_vars(cn, contains("x"), contains("y")), cn[c("x", "y")])
+ expect_equal(select_vars(cn, contains("x"), -contains("y")), cn["x"])
+ expect_equal(select_vars(cn, -contains("x"), contains("y")), cn[c("y", "z")])
+ expect_equal(select_vars(cn, -contains("x"), -contains("y")), cn["z"])
+
+ # matched(present) + matched(not present)
+ expect_equal(select_vars(cn, contains("x"), contains("foo")), cn["x"])
+ expect_equal(select_vars(cn, contains("x"), -contains("foo")), cn["x"])
+ expect_equal(select_vars(cn, -contains("x"), contains("foo")), cn[c("y", "z")])
+ expect_equal(select_vars(cn, -contains("x"), -contains("foo")), cn[c("y", "z")])
+
+ # matched(not present) + matched(present)
+ expect_equal(select_vars(cn, contains("foo"), contains("x")), cn["x"])
+ expect_equal(select_vars(cn, contains("foo"), -contains("x")), cn[integer()])
+ expect_equal(select_vars(cn, -contains("foo"), contains("x")), cn)
+ expect_equal(select_vars(cn, -contains("foo"), -contains("x")), cn[c("y", "z")])
+
+ # matched(not present) + matched(not present)
+ expect_equal(select_vars(cn, contains("foo"), contains("bar")), cn[integer()])
+ expect_equal(select_vars(cn, contains("foo"), -contains("bar")), cn[integer()])
+ expect_equal(select_vars(cn, -contains("foo"), contains("bar")), cn)
+ expect_equal(select_vars(cn, -contains("foo"), -contains("bar")), cn)
+})
+
+test_that("middle (no-match) selector should not clear previous selectors (issue #2275)", {
+ cn <- setNames(nm = c("x", "y", "z"))
+
+ expect_equal(
+ select_vars(cn, contains("x"), contains("foo"), contains("z")),
+ cn[c("x", "z")]
+ )
+ expect_equal(
+ select_vars(cn, contains("x"), -contains("foo"), contains("z")),
+ cn[c("x", "z")]
+ )
+})
+
+test_that("can select with c() (#2685)", {
+ expect_identical(select_vars(letters, c(a, z)), c(a = "a", z = "z"))
+})
+
+test_that("can select with .data pronoun (#2715)", {
+ expect_identical(select_vars("foo", .data$foo), c(foo = "foo"))
+ expect_identical(select_vars("foo", .data[["foo"]]), c(foo = "foo"))
+
+ expect_identical(select_vars(c("a", "b", "c"), .data$a : .data$b), c(a = "a", b = "b"))
+ expect_identical(select_vars(c("a", "b", "c"), .data[["a"]] : .data[["b"]]), c(a = "a", b = "b"))
+})
+
+
+# rename_vars -------------------------------------------------------------
+
+test_that("when strict = FALSE, rename_vars always succeeds", {
+ expect_error(
+ rename_vars(c("a", "b"), d = e, strict = TRUE),
+ "`e` contains unknown variables",
+ fixed = TRUE
+ )
+
+ expect_equal(
+ rename_vars(c("a", "b"), d = e, strict = FALSE),
+ c("a" = "a", "b" = "b")
+ )
+})
+
+test_that("rename_vars() expects symbol or string", {
+ expect_error(
+ rename_vars(letters, d = 1),
+ '`d` = 1 must be a symbol or a string',
+ fixed = TRUE
+ )
+})
+
+
+
+# tbl_at_vars -------------------------------------------------------------
+
+test_that("tbl_at_vars() errs on bad input", {
+ expect_error(
+ tbl_at_vars(iris, raw(3)),
+ "`.vars` must be a character/numeric vector or a `vars()` object, not raw",
+ fixed = TRUE
+ )
+})
+
+
+
+# tbl_if_vars -------------------------------------------------------------
+
+test_that("tbl_if_vars() errs on bad input", {
+ expect_error(
+ tbl_if_vars(iris, funs(identity, force), environment()),
+ "`.predicate` must have length 1, not 2",
+ fixed = TRUE
+ )
+})
diff --git a/tests/testthat/test-select.r b/tests/testthat/test-select.r
index d976c49..9ceb597 100644
--- a/tests/testthat/test-select.r
+++ b/tests/testthat/test-select.r
@@ -1,18 +1,10 @@
context("Select")
-df <- as.data.frame(as.list(setNames(1:26, letters)))
-tbls <- test_load(df)
-
-test_that("two selects equivalent to one", {
- compare_tbls(tbls, function(tbl) tbl %>% select(l:s) %>% select(n:o),
- ref = select(df, n:o))
-})
-
test_that("select does not lose grouping (#147)", {
- df <- tbl_df(data.frame(a = rep(1:4, 2), b = rep(1:4, each = 2), x = runif(8)))
+ df <- tibble(a = rep(1:4, 2), b = rep(1:4, each = 2), x = runif(8))
grouped <- df %>% group_by(a) %>% select(a, b, x)
- expect_equal(groups(grouped), list(quote(a)))
+ expect_groups(grouped, "a")
})
test_that("grouping variables preserved with a message (#1511)", {
@@ -37,6 +29,7 @@ test_that("select doesn't fail if some names missing", {
# expect_equal(select(df3, x), data.frame(x = 1:10))
})
+
# Empty selects -------------------------------------------------
test_that("select with no args returns nothing", {
@@ -72,46 +65,54 @@ test_that("last rename wins", {
test_that("negative index removes values", {
vars <- letters[1:3]
- expect_equal(select_vars(vars, -c), c("a" = "a", "b" = "b"))
- expect_equal(select_vars(vars, a:c, -c), c("a" = "a", "b" = "b"))
- expect_equal(select_vars(vars, a, b, c, -c), c("a" = "a", "b" = "b"))
- expect_equal(select_vars(vars, -c, a, b), c("a" = "a", "b" = "b"))
+ expect_equal(select_vars(vars, -c), c(a = "a", b = "b"))
+ expect_equal(select_vars(vars, a:c, -c), c(a = "a", b = "b"))
+ expect_equal(select_vars(vars, a, b, c, -c), c(a = "a", b = "b"))
+ expect_equal(select_vars(vars, -c, a, b), c(a = "a", b = "b"))
})
-test_that("select can be before group_by (#309)",{
- df <- data.frame(id=c(1,1,2,2,2,3,3,4,4,5), year=c(2013,2013,2012,2013,2013,2013,2012,2012,2013,2013), var1=rnorm(10))
+test_that("select can be before group_by (#309)", {
+ df <- data.frame(
+ id = c(1, 1, 2, 2, 2, 3, 3, 4, 4, 5),
+ year = c(2013, 2013, 2012, 2013, 2013, 2013, 2012, 2012, 2013, 2013),
+ var1 = rnorm(10)
+ )
dfagg <- df %>%
group_by(id, year) %>%
select(id, year, var1) %>%
- summarise(var1=mean(var1))
+ summarise(var1 = mean(var1))
expect_equal(names(dfagg), c("id", "year", "var1"))
- expect_equal(attr(dfagg, "vars" ), list(quote(id)))
+ expect_equal(attr(dfagg, "vars"), "id")
})
-# Database ---------------------------------------------------------------------
-
-test_that("select renames variables (#317)", {
- skip_if_no_sqlite()
-
- first <- tbls$sqlite %>% select(A = a)
- expect_equal(tbl_vars(first), "A")
- expect_equal(tbl_vars(first %>% select(A)), "A")
- expect_equal(tbl_vars(first %>% select(B = A)), "B")
+test_that("rename does not crash with invalid grouped data frame (#640)", {
+ df <- data_frame(a = 1:3, b = 2:4, d = 3:5) %>% group_by(a, b)
+ df$a <- NULL
+ expect_equal(
+ df %>% rename(e = d) %>% ungroup,
+ data_frame(b = 2:4, e = 3:5)
+ )
+ expect_equal(
+ df %>% rename(e = b) %>% ungroup,
+ data_frame(e = 2:4, d = 3:5)
+ )
})
-test_that("select preserves grouping vars", {
- skip_if_no_sqlite()
+test_that("can select with character vectors", {
+ expect_identical(select_vars(letters, "b", !! "z", c("b", "c")), set_names(c("b", "z", "c")))
+})
- first <- tbls$sqlite %>% group_by(b) %>% select(a)
- expect_equal(tbl_vars(first), c("b", "a"))
+test_that("abort on unknown columns", {
+ expect_error(select_vars(letters, "foo"), "must match column names")
+ expect_error(select_vars(letters, c("a", "bar", "foo", "d")), "bar, foo")
})
-test_that("rename handles grouped data (#640)", {
- res <- data_frame(a = 1, b = 2) %>% group_by(a) %>% rename(c = b)
- expect_equal(names(res), c("a", "c"))
+test_that("rename() handles data pronoun", {
+ expect_identical(rename(tibble(x = 1), y = .data$x), tibble(y = 1))
})
+
# combine_vars ------------------------------------------------------------
# This is the low C++ function which works on integer indices
@@ -154,9 +155,21 @@ test_that("if one name for multiple vars, use integer index", {
})
test_that("invalid inputs raise error", {
- expect_error(combine_vars(names(mtcars), list(0)), "positive or negative")
- expect_error(combine_vars(names(mtcars), list(c(-1, 1))), "positive or negative")
- expect_error(combine_vars(names(mtcars), list(12)), "must be between")
+ expect_error(
+ combine_vars(names(mtcars), list(0)),
+ "Each argument must yield either positive or negative integers",
+ fixed = TRUE
+ )
+ expect_error(
+ combine_vars(names(mtcars), list(c(-1, 1))),
+ "Each argument must yield either positive or negative integers",
+ fixed = TRUE
+ )
+ expect_error(
+ combine_vars(names(mtcars), list(12)),
+ "Position must be between 0 and n",
+ fixed = TRUE
+ )
})
test_that("select succeeds in presence of raw columns (#1803)", {
@@ -166,15 +179,35 @@ test_that("select succeeds in presence of raw columns (#1803)", {
expect_identical(select(df, -b), df["a"])
})
-test_that("select_if can use predicate", {
- expect_identical(iris %>% select_if(is.factor), iris["Species"])
+test_that("arguments to select() don't match select_vars() arguments", {
+ df <- tibble(a = 1)
+ expect_identical(select(df, var = a), tibble(var = 1))
+ expect_identical(select(group_by(df, a), var = a), group_by(tibble(var = 1), var))
+ expect_identical(select(df, exclude = a), tibble(exclude = 1))
+ expect_identical(select(df, include = a), tibble(include = 1))
+ expect_identical(select(group_by(df, a), exclude = a), group_by(tibble(exclude = 1), exclude))
+ expect_identical(select(group_by(df, a), include = a), group_by(tibble(include = 1), include))
+})
+
+test_that("arguments to rename() don't match rename_vars() arguments (#2861)", {
+ df <- tibble(a = 1)
+ expect_identical(rename(df, var = a), tibble(var = 1))
+ expect_identical(rename(group_by(df, a), var = a), group_by(tibble(var = 1), var))
+ expect_identical(rename(df, strict = a), tibble(strict = 1))
+ expect_identical(rename(group_by(df, a), strict = a), group_by(tibble(strict = 1), strict))
})
-test_that("select_if fails with databases", {
- expect_error(memdb_frame(x = 1) %>% select_if(is.numeric) %>% collect())
+test_that("can select() with .data pronoun (#2715)", {
+ expect_identical(select(mtcars, .data$cyl), select(mtcars, cyl))
})
-test_that("select_if keeps grouping cols", {
- expect_silent(df <- iris %>% group_by(Species) %>% select_if(is.numeric))
- expect_equal(df, tbl_df(iris[c(5, 1:4)]))
+test_that("can select() with character vectors", {
+ expect_identical(select(mtcars, "cyl", !! "disp", c("cyl", "am", "drat")), mtcars[c("cyl", "disp", "am", "drat")])
+})
+
+test_that("rename() to UTF-8 column names", {
+ skip_on_os("windows") # needs an rlang update? #3049
+ df <- data_frame(a = 1) %>% rename("\u5e78" := a)
+
+ expect_equal(colnames(df), "\u5e78")
})
diff --git a/tests/testthat/test-sets.R b/tests/testthat/test-sets.R
index 7ac065f..c0362ed 100644
--- a/tests/testthat/test-sets.R
+++ b/tests/testthat/test-sets.R
@@ -1,59 +1,56 @@
context("Set ops")
-tbls <- test_load(mtcars)
+test_that("set operation give useful error message. #903", {
+ alfa <- data_frame(
+ land = c("Sverige", "Norway", "Danmark", "Island", "GB"),
+ data = rnorm(length(land))
+ )
-test_that("results are the same across sources", {
- compare_tbls(tbls,
- function(x) setdiff(x, filter(x, cyl == 4)),
- filter(mtcars, cyl != 4)
+ beta <- data_frame(
+ land = c("Norge", "Danmark", "Island", "Storbritannien"),
+ data2 = rnorm(length(land))
)
- compare_tbls(tbls,
- function(x) intersect(x, filter(x, cyl == 4)),
- filter(mtcars, cyl == 4)
+ expect_error(
+ intersect(alfa, beta),
+ "not compatible: \n- Cols in y but not x: `data2`. \n- Cols in x but not y: `data`. \n",
+ fixed = TRUE
+ )
+ expect_error(
+ union(alfa, beta),
+ "not compatible: \n- Cols in y but not x: `data2`. \n- Cols in x but not y: `data`. \n",
+ fixed = TRUE
+ )
+ expect_error(
+ setdiff(alfa, beta),
+ "not compatible: \n- Cols in y but not x: `data2`. \n- Cols in x but not y: `data`. \n",
+ fixed = TRUE
)
- compare_tbls(tbls,
- function(x) union(x, filter(x, cyl == 4)),
- mtcars)
- compare_tbls(tbls,
- function(x) union(filter(x, cyl == 6), filter(x, cyl == 4)),
- filter(mtcars, cyl %in% c(4, 6)))
-})
-
-test_that("set operation give useful error message. #903", {
- alfa <- data_frame(land=c("Sverige","Norway","Danmark","Island","GB"),
- data=rnorm(length(land)))
-
- beta <- data_frame(land=c("Norge","Danmark","Island","Storbritannien"),
- data2=rnorm(length(land)))
- expect_error( intersect(alfa, beta), "Cols in y but not x" )
- expect_error( union(alfa, beta), "Cols in y but not x" )
- expect_error( setdiff(alfa, beta), "Cols in y but not x" )
})
test_that("set operations use coercion rules (#799)", {
df1 <- data_frame(x = 1:2, y = c(1, 1))
df2 <- data_frame(x = 1:2, y = 1:2)
- expect_equal( nrow(union(df1, df2)), 3L )
- expect_equal( nrow(intersect(df1, df2)), 1L )
- expect_equal( nrow(setdiff(df1, df2)), 1L )
+ expect_equal(nrow(union(df1, df2)), 3L)
+ expect_equal(nrow(intersect(df1, df2)), 1L)
+ expect_equal(nrow(setdiff(df1, df2)), 1L)
df1 <- data_frame(x = factor(letters[1:10]))
df2 <- data_frame(x = letters[6:15])
- expect_warning( { res <- intersect(df1, df2) })
- expect_equal( res, data_frame(x = letters[6:10]) )
- expect_warning( { res <- intersect(df2, df1) })
- expect_equal( res, data_frame(x = letters[6:10]) )
+ expect_warning(res <- intersect(df1, df2))
+ expect_equal(res, data_frame(x = letters[6:10]))
+ expect_warning(res <- intersect(df2, df1))
+ expect_equal(res, data_frame(x = letters[6:10]))
- expect_warning( { res <- union(df1, df2) })
- expect_equal( res, data_frame(x = letters[1:15]) )
- expect_warning( { res <- union(df2, df1) })
- expect_equal( res, data_frame(x = letters[1:15]) )
+ expect_warning(res <- union(df1, df2))
+ expect_equal(res, data_frame(x = letters[1:15]))
+ expect_warning(res <- union(df2, df1))
+ expect_equal(res, data_frame(x = letters[1:15]))
- expect_warning( { res <- setdiff(df1, df2) })
- expect_equal( res, data_frame(x = letters[1:5]) )
- expect_warning( { res <- setdiff(df2, df1) })
- expect_equal( res, data_frame(x = letters[11:15]) )
+ expect_warning(res <- setdiff(df1, df2))
+ expect_equal(res, data_frame(x = letters[1:5]))
+ expect_warning(res <- setdiff(df2, df1))
+ expect_equal(res, data_frame(x = letters[11:15]))
})
test_that("setdiff handles factors with NA (#1526)", {
@@ -61,13 +58,14 @@ test_that("setdiff handles factors with NA (#1526)", {
df2 <- data_frame(x = factor("a"))
res <- setdiff(df1, df2)
- expect_is( res$x, "factor")
- expect_equal( levels(res$x), "a")
- expect_true( is.na(res$x[1]) )
+ expect_is(res$x, "factor")
+ expect_equal(levels(res$x), "a")
+ expect_true(is.na(res$x[1]))
})
test_that("intersect does not unnecessarily coerce (#1722)", {
df <- data_frame(a = 1L)
- res <- intersect(df,df)
+ res <- intersect(df, df)
expect_is(res$a, "integer")
})
+
diff --git a/tests/testthat/test-slice.r b/tests/testthat/test-slice.r
index 073c7c1..0a48040 100644
--- a/tests/testthat/test-slice.r
+++ b/tests/testthat/test-slice.r
@@ -1,10 +1,10 @@
context("slice")
-test_that( "slice handles numeric input (#226)", {
+test_that("slice handles numeric input (#226)", {
g <- mtcars %>% group_by(cyl)
res <- g %>% slice(1)
expect_equal(nrow(res), 3)
- expect_equal(res, g %>% filter(row_number()==1L))
+ expect_equal(res, g %>% filter(row_number() == 1L))
expect_equal(
mtcars %>% slice(1),
@@ -12,15 +12,15 @@ test_that( "slice handles numeric input (#226)", {
)
})
-test_that( "slice silently ignores out of range values (#226)", {
- expect_equal( slice(mtcars, c(2,100)), slice(mtcars, 2))
+test_that("slice silently ignores out of range values (#226)", {
+ expect_equal(slice(mtcars, c(2, 100)), slice(mtcars, 2))
g <- group_by(mtcars, cyl)
- expect_equal( slice(g, c(2,100)), slice(g, 2))
+ expect_equal(slice(g, c(2, 100)), slice(g, 2))
})
-test_that( "slice works with 0 args", {
+test_that("slice works with 0 args", {
expect_equivalent(slice(mtcars), mtcars)
})
@@ -28,13 +28,17 @@ test_that("slice works with negative indices", {
res <- slice(mtcars, -(1:2))
exp <- tail(mtcars, -2)
expect_equal(names(res), names(exp))
- for( col in names(res)){
- expect_equal( res[[col]], exp[[col]] )
+ for (col in names(res)) {
+ expect_equal(res[[col]], exp[[col]])
}
})
test_that("slice forbids positive and negative together", {
- expect_error(mtcars %>% slice(c(-1,2)))
+ expect_error(
+ mtcars %>% slice(c(-1, 2)),
+ "Found 1 positive indices and 1 negative indices",
+ fixed = TRUE
+ )
})
test_that("slice works with grouped data", {
@@ -42,53 +46,75 @@ test_that("slice works with grouped data", {
res <- slice(g, 1:2)
exp <- filter(g, row_number() < 3)
- expect_equal(res,exp)
+ expect_equal(res, exp)
res <- slice(g, -(1:2))
exp <- filter(g, row_number() >= 3)
- expect_equal(res,exp)
+ expect_equal(res, exp)
})
test_that("slice gives correct rows (#649)", {
a <- data_frame(value = paste0("row", 1:10))
- expect_equal( slice(a, 1:3)$value, paste0("row", 1:3) )
- expect_equal( slice(a, c(4, 6, 9))$value, paste0("row", c(4,6,9)))
+ expect_equal(slice(a, 1:3)$value, paste0("row", 1:3))
+ expect_equal(slice(a, c(4, 6, 9))$value, paste0("row", c(4, 6, 9)))
- a <- data_frame(value = paste0("row", 1:10), group = rep(1:2, each = 5)) %>% group_by(group)
+ a <- data_frame(
+ value = paste0("row", 1:10),
+ group = rep(1:2, each = 5)
+ ) %>%
+ group_by(group)
- expect_equal( slice(a, 1:3)$value, paste0("row", c(1:3,6:8) ) )
- expect_equal( slice(a, c(2,4))$value, paste0("row", c(2,4,7,9)))
+ expect_equal(slice(a, 1:3)$value, paste0("row", c(1:3, 6:8)))
+ expect_equal(slice(a, c(2, 4))$value, paste0("row", c(2, 4, 7, 9)))
})
-test_that( "slice handles NA (#1235)", {
- df <- data_frame( x = 1:3 )
- expect_equal( nrow(slice(df, NA_integer_)), 0L )
- expect_equal( nrow(slice(df, c(1L, NA_integer_))), 1L )
- expect_equal( nrow(slice(df, c(-1L, NA_integer_))), 2L )
+test_that("slice handles NA (#1235)", {
+ df <- data_frame(x = 1:3)
+ expect_equal(nrow(slice(df, NA_integer_)), 0L)
+ expect_equal(nrow(slice(df, c(1L, NA_integer_))), 1L)
+ expect_equal(nrow(slice(df, c(-1L, NA_integer_))), 2L)
- df <- data_frame( x = 1:4, g = rep(1:2, 2) ) %>% group_by(g)
- expect_equal( nrow(slice(df, NA)), 0L )
- expect_equal( nrow(slice(df, c(1,NA))), 2 )
- expect_equal( nrow(slice(df, c(-1,NA))), 2 )
+ df <- data_frame(x = 1:4, g = rep(1:2, 2)) %>% group_by(g)
+ expect_equal(nrow(slice(df, NA)), 0L)
+ expect_equal(nrow(slice(df, c(1, NA))), 2)
+ expect_equal(nrow(slice(df, c(-1, NA))), 2)
})
test_that("slice handles empty data frames (#1219)", {
- df <- data.frame(x=numeric())
+ df <- data.frame(x = numeric())
res <- df %>% slice(1:3)
- expect_equal( nrow(res), 0L)
- expect_equal( names(res), "x" )
+ expect_equal(nrow(res), 0L)
+ expect_equal(names(res), "x")
})
test_that("slice works fine if n > nrow(df) (#1269)", {
slice_res <- mtcars %>% group_by(cyl) %>% slice(8)
- filter_res <- mtcars %>% group_by(cyl) %>% filter( row_number() == 8 )
- expect_equal( slice_res, filter_res )
+ filter_res <- mtcars %>% group_by(cyl) %>% filter(row_number() == 8)
+ expect_equal(slice_res, filter_res)
})
test_that("slice strips grouped indices (#1405)", {
res <- mtcars %>% group_by(cyl) %>% slice(1) %>% mutate(mpgplus = mpg + 1)
- expect_equal( nrow(res), 3L)
- expect_equal( attr(res, "indices"), as.list(0:2) )
+ expect_equal(nrow(res), 3L)
+ expect_equal(attr(res, "indices"), as.list(0:2))
+})
+
+test_that("slice works with zero-column data frames (#2490)", {
+ expect_equal(
+ data_frame(a = 1:3) %>% select(-a) %>% slice(1) %>% nrow,
+ 1L
+ )
+})
+
+test_that("slice works under gctorture2", {
+ x <- tibble(y = 1:10)
+ with_gctorture2(999, x2 <- slice(x, 1:10))
+ expect_identical(x, x2)
+})
+
+test_that("slice correctly computes positive indices from negative indices (#3073)", {
+ x <- tibble(y = 1:10)
+ expect_identical(slice(x, -10:-30), tibble(y = 1:9))
})
diff --git a/tests/testthat/test-sql-build.R b/tests/testthat/test-sql-build.R
deleted file mode 100644
index 8602607..0000000
--- a/tests/testthat/test-sql-build.R
+++ /dev/null
@@ -1,186 +0,0 @@
-context("SQL: build")
-
-test_that("base source of lazy frame is always 'df'", {
- out <- lazy_frame(x = 1, y = 5) %>% sql_build()
- expect_equal(out, ident("df"))
-})
-
-test_that("connection affects SQL generation", {
- lf <- lazy_frame(x = 1, y = 5) %>% summarise(n = n())
-
- out1 <- lf %>% sql_build()
- out2 <- lf %>% sql_build(con = structure(list(), class = "PostgreSQLConnection"))
-
- expect_equal(out1$select, sql('COUNT() AS "n"'))
- expect_equal(out2$select, sql('count(*) AS "n"'))
-})
-
-# select and rename -------------------------------------------------------
-
-test_that("select picks variables", {
- out <- lazy_frame(x1 = 1, x2 = 1, x3 = 2) %>%
- select(x1:x2) %>%
- sql_build()
-
- expect_equal(out$select, ident("x1" = "x1", "x2" = "x2"))
-})
-
-test_that("select renames variables", {
- out <- lazy_frame(x1 = 1, x2 = 1, x3 = 2) %>%
- select(y = x1, z = x2) %>%
- sql_build()
-
- expect_equal(out$select, ident("y" = "x1", "z" = "x2"))
-})
-
-test_that("select can refer to variables in local env", {
- vars <- c("x", "y")
- out <- lazy_frame(x = 1, y = 1) %>%
- select(one_of(vars)) %>%
- sql_build()
-
- expect_equal(out$select, ident("x" = "x", "y" = "y"))
-})
-
-test_that("rename preserves existing vars", {
- out <- lazy_frame(x = 1, y = 1) %>%
- rename(z = y) %>%
- sql_build()
-
- expect_equal(out$select, ident("x" = "x", "z" = "y"))
-})
-
-
-# arrange -----------------------------------------------------------------
-
-test_that("arrange generates order_by", {
- out <- lazy_frame(x = 1, y = 1) %>%
- arrange(x) %>%
- sql_build()
-
- expect_equal(out$order_by, sql('"x"'))
-})
-
-test_that("arrange converts desc", {
- out <- lazy_frame(x = 1, y = 1) %>%
- arrange(desc(x)) %>%
- sql_build()
-
- expect_equal(out$order_by, sql('"x" DESC'))
-})
-
-test_that("grouped arrange doesn't order by groups", {
- out <- lazy_frame(x = 1, y = 1) %>%
- group_by(x) %>%
- arrange(y) %>%
- sql_build()
-
- expect_equal(out$order_by, sql('"y"'))
-})
-
-
-# summarise ---------------------------------------------------------------
-
-test_that("summarise generates group_by and select", {
- out <- lazy_frame(g = 1) %>%
- group_by(x) %>%
- summarise(n = n()) %>%
- sql_build()
-
- expect_equal(out$group_by, sql('"x"'))
- expect_equal(out$select, sql('"x"', 'COUNT() AS "n"'))
-})
-
-
-# filter ------------------------------------------------------------------
-
-test_that("filter generates simple expressions", {
- out <- lazy_frame(x = 1) %>%
- filter(x > 1L) %>%
- sql_build()
-
- expect_equal(out$where, sql('"x" > 1'))
-})
-
-# mutate ------------------------------------------------------------------
-
-test_that("mutate generates simple expressions", {
- out <- lazy_frame(x = 1) %>%
- mutate(y = x + 1L) %>%
- sql_build()
-
- expect_equal(out$select, sql('"x"', '"x" + 1 AS "y"'))
-})
-
-# ungroup by --------------------------------------------------------------
-
-test_that("ungroup drops PARTITION BY", {
- out <- lazy_frame(x = 1) %>%
- group_by(x) %>%
- ungroup() %>%
- mutate(x = rank(x)) %>%
- sql_build()
- expect_equal(out$select, sql('rank() OVER (ORDER BY "x") AS "x"'))
-
-})
-
-# distinct ----------------------------------------------------------------
-
-test_that("distinct sets flagged", {
- out1 <- lazy_frame(x = 1) %>%
- select() %>%
- sql_build()
- expect_false(out1$distinct)
-
- out2 <- lazy_frame(x = 1) %>%
- distinct() %>%
- sql_build()
- expect_true(out2$distinct)
-})
-
-
-# head --------------------------------------------------------------------
-
-test_that("head limits rows", {
- out <- lazy_frame(x = 1:100) %>%
- head(10) %>%
- sql_build()
-
- expect_equal(out$limit, 10)
-})
-
-
-# joins -------------------------------------------------------------------
-
-test_that("join captures both tables", {
- lf1 <- lazy_frame(x = 1, y = 2)
- lf2 <- lazy_frame(x = 1, z = 2)
-
- out <- inner_join(lf1, lf2) %>% sql_build()
-
- expect_equal(op_vars(out$x), c("x", "y"))
- expect_equal(op_vars(out$y), c("x", "z"))
- expect_equal(out$type, "inner")
-})
-
-test_that("semi join captures both tables", {
- lf1 <- lazy_frame(x = 1, y = 2)
- lf2 <- lazy_frame(x = 1, z = 2)
-
- out <- semi_join(lf1, lf2) %>% sql_build()
-
- expect_equal(op_vars(out$x), c("x", "y"))
- expect_equal(op_vars(out$y), c("x", "z"))
- expect_equal(out$anti, FALSE)
-})
-
-test_that("set ops captures both tables", {
- lf1 <- lazy_frame(x = 1, y = 2)
- lf2 <- lazy_frame(x = 1, z = 2)
-
- out <- union(lf1, lf2) %>% sql_build()
-
- expect_equal(op_vars(out$x), c("x", "y"))
- expect_equal(op_vars(out$y), c("x", "z"))
- expect_equal(out$type, "UNION")
-})
diff --git a/tests/testthat/test-sql-escape.r b/tests/testthat/test-sql-escape.r
deleted file mode 100644
index 6d63b3e..0000000
--- a/tests/testthat/test-sql-escape.r
+++ /dev/null
@@ -1,35 +0,0 @@
-context("SQL: escaping")
-
-# Identifiers ------------------------------------------------------------------
-
-ei <- function(...) unclass(escape(ident(c(...))))
-
-test_that("identifiers are doubled quoted", {
- expect_equal(ei("x"), '"x"')
-})
-
-test_that("identifiers are comma separated", {
- expect_equal(ei("x", "y"), '"x", "y"')
-})
-
-test_that("identifier names become AS", {
- expect_equal(ei(x = "y"), '"y" AS "x"')
-})
-
-# Missing values ----------------------------------------------------------------
-
-test_that("missing vaues become null", {
- expect_equal(escape(NA), sql("NULL"))
- expect_equal(escape(NA_real_), sql("NULL"))
- expect_equal(escape(NA_integer_), sql("NULL"))
- expect_equal(escape(NA_character_), sql("NULL"))
-})
-
-
-# Times -------------------------------------------------------------------
-
-test_that("times are converted to ISO 8601", {
- x <- ISOdatetime(2000, 1, 2, 3, 4, 5, tz = "US/Central")
- expect_equal(escape(x), sql("'2000-01-02T09:04:05Z'"))
-})
-
diff --git a/tests/testthat/test-sql-joins.R b/tests/testthat/test-sql-joins.R
deleted file mode 100644
index eb5027e..0000000
--- a/tests/testthat/test-sql-joins.R
+++ /dev/null
@@ -1,51 +0,0 @@
-context("SQL: joins")
-
-src <- src_sqlite(tempfile(), create = TRUE)
-df1 <- copy_to(src, data.frame(x = 1:5, y = 1:5), "df1")
-df2 <- copy_to(src, data.frame(a = 5:1, b = 1:5), "df2")
-fam <- copy_to(src, data.frame(id = 1:5, parent = c(NA, 1, 2, 2, 4)), "fam")
-
-test_that("named by join by different x and y vars", {
- skip_if_no_sqlite()
-
- j1 <- collect(inner_join(df1, df2, c("x" = "a")))
- expect_equal(names(j1), c("x", "y", "a", "b"))
- expect_equal(nrow(j1), 5)
-
- j2 <- collect(inner_join(df1, df2, c("x" = "a", "y" = "b")))
- expect_equal(names(j2), c("x", "y", "a", "b"))
- expect_equal(nrow(j2), 1)
-})
-
-test_that("inner join doesn't result in duplicated columns ", {
- skip_if_no_sqlite()
- expect_equal(colnames(dplyr::inner_join(df1, df1)), c('x', 'y'))
-})
-
-test_that("self-joins allowed with named by", {
- skip_if_no_sqlite()
- fam <- memdb_frame(id = 1:5, parent = c(NA, 1, 2, 2, 4))
-
- j1 <- fam %>% left_join(fam, by = c("parent" = "id"))
- j2 <- fam %>% inner_join(fam, by = c("parent" = "id"))
-
- expect_equal(op_vars(j1), c("id.x", "parent.x", "id.y", "parent.y"))
- expect_equal(op_vars(j2), c("id.x", "parent.x", "id.y", "parent.y"))
- expect_equal(nrow(collect(j1)), 5)
- expect_equal(nrow(collect(j2)), 4)
-
- j3 <- collect(semi_join(fam, fam, by = c("parent" = "id")))
- j4 <- collect(anti_join(fam, fam, by = c("parent" = "id")))
-
- expect_equal(j3, filter(collect(fam), !is.na(parent)))
- expect_equal(j4, filter(collect(fam), is.na(parent)))
-})
-
-test_that("suffix modifies duplicated variable names", {
- skip_if_no_sqlite()
- j1 <- collect(inner_join(fam, fam, by = c("parent" = "id"), suffix = c("1", "2")))
- j2 <- collect(left_join(fam, fam, by = c("parent" = "id"), suffix = c("1", "2")))
-
- expect_named(j1, c("id1", "parent1", "id2", "parent2"))
- expect_named(j2, c("id1", "parent1", "id2", "parent2"))
-})
diff --git a/tests/testthat/test-sql-render.R b/tests/testthat/test-sql-render.R
deleted file mode 100644
index 2d9b145..0000000
--- a/tests/testthat/test-sql-render.R
+++ /dev/null
@@ -1,157 +0,0 @@
-context("SQL: render")
-# These test the full SQL rendering pipeline by running very simple examples
-# against a live SQLite database.
-
-# Single table ------------------------------------------------------------
-
-test_that("rendering table wraps in SELECT *", {
- out <- memdb_frame(x = 1)
- expect_match(out %>% sql_render, "^SELECT [*]\nFROM `[^`]*`$")
- expect_equal(out %>% collect, data_frame(x = 1))
-})
-
-test_that("quoting for rendering mutated grouped table", {
- out <- memdb_frame(x = 1, y = 2) %>% mutate(y = x)
- expect_match(out %>% sql_render, "^SELECT `x`, `x` AS `y`\nFROM `[^`]*`$")
- expect_equal(out %>% collect, data_frame(x = 1, y = 1))
-})
-
-test_that("quoting for rendering ordered grouped table", {
- out <- memdb_frame(x = 1, y = 2) %>% group_by(x) %>% arrange(y)
- expect_match(out %>% sql_render, "^SELECT [*]\nFROM `[^`]*`\nORDER BY `y`$")
- expect_equal(out %>% collect, data_frame(x = 1, y = 2))
-})
-
-test_that("quoting for rendering summarized grouped table", {
- out <- memdb_frame(x = 1) %>% group_by(x) %>% summarize(n = n())
- expect_match(out %>% sql_render, "^SELECT `x`, COUNT[(][)] AS `n`\nFROM `[^`]*`\nGROUP BY `x`$")
- expect_equal(out %>% collect, data_frame(x = 1, n = 1L))
-})
-
-# Single table verbs ------------------------------------------------------
-
-test_that("select quotes correctly", {
- out <- memdb_frame(x = 1, y = 1) %>%
- select(x) %>%
- collect()
- expect_equal(out, data_frame(x = 1))
-})
-
-test_that("select can rename", {
- out <- memdb_frame(x = 1, y = 2) %>%
- select(y = x) %>%
- collect()
- expect_equal(out, data_frame(y = 1))
-})
-
-test_that("distinct adds DISTINCT suffix", {
- out <- memdb_frame(x = c(1, 1)) %>% distinct()
-
- expect_match(out %>% sql_render(), "SELECT DISTINCT")
- expect_equal(out %>% collect(), data_frame(x = 1))
-})
-
-test_that("distinct over columns uses GROUP BY", {
- out <- memdb_frame(x = c(1, 2), y = c(1, 1)) %>% distinct(y)
-
- expect_match(out %>% sql_render(), "SELECT `y`.*GROUP BY `y`")
- expect_equal(out %>% collect(), data_frame(y = 1))
-})
-
-test_that("head limits rows returned", {
- out <- memdb_frame(x = 1:100) %>%
- head(10) %>%
- collect()
-
- expect_equal(nrow(out), 10)
-})
-
-test_that("head accepts fractional input", {
- out <- memdb_frame(x = 1:100) %>%
- head(10.5) %>%
- collect()
-
- expect_equal(nrow(out), 10)
-})
-
-test_that("head renders to integer fractional input", {
- out <- memdb_frame(x = 1:100) %>%
- head(10.5) %>%
- sql_render()
-
- expect_match(out, "LIMIT 10$")
-})
-
-test_that("head works with huge whole numbers", {
- out <- memdb_frame(x = 1:100) %>%
- head(1e10) %>%
- collect()
-
- expect_equal(out, data_frame(x = 1:100))
-})
-
-test_that("mutate overwrites previous variables", {
- df <- memdb_frame(x = 1:5) %>%
- mutate(x = x + 1) %>%
- mutate(x = x + 1) %>%
- collect()
-
- expect_equal(names(df), "x")
- expect_equal(df$x, 1:5 + 2)
-})
-
-test_that("sequence of operations work", {
- out <- memdb_frame(x = c(1, 2, 3, 4)) %>%
- select(y = x) %>%
- mutate(z = 2 * y) %>%
- filter(z == 2) %>%
- collect()
-
- expect_equal(out, data_frame(y = 1, z = 2))
-})
-
-test_that("compute creates correct column names", {
- out <- memdb_frame(x = 1) %>%
- group_by(x) %>%
- summarize(n = n()) %>%
- compute() %>%
- collect()
-
- expect_equal(out, data_frame(x = 1, n = 1L))
-})
-
-# Joins make valid sql ----------------------------------------------------
-
-test_that("join generates correct sql", {
- lf1 <- memdb_frame(x = 1, y = 2)
- lf2 <- memdb_frame(x = 1, z = 3)
-
- out <- lf1 %>%
- inner_join(lf2, by = "x") %>%
- collect()
-
- expect_equal(out, data.frame(x = 1, y = 2, z = 3))
-})
-
-test_that("semi join generates correct sql", {
- lf1 <- memdb_frame(x = c(1, 2), y = c(2, 3))
- lf2 <- memdb_frame(x = 1)
-
- out <- lf1 %>%
- inner_join(lf2, by = "x") %>%
- collect()
-
- expect_equal(out, data.frame(x = 1, y = 2))
-})
-
-
-test_that("set ops generates correct sql", {
- lf1 <- memdb_frame(x = 1)
- lf2 <- memdb_frame(x = c(1, 2))
-
- out <- lf1 %>%
- union(lf2) %>%
- collect()
-
- expect_equal(out, data.frame(x = c(1, 2)))
-})
diff --git a/tests/testthat/test-sql-translation.r b/tests/testthat/test-sql-translation.r
deleted file mode 100644
index 2376ae2..0000000
--- a/tests/testthat/test-sql-translation.r
+++ /dev/null
@@ -1,119 +0,0 @@
-context("SQL translation")
-
-test_that("Simple maths is correct", {
- expect_equal(translate_sql(1 + 2), sql("1.0 + 2.0"))
- expect_equal(translate_sql(2 * 4), sql("2.0 * 4.0"))
- expect_equal(translate_sql(5 ^ 2), sql("POWER(5.0, 2.0)"))
- expect_equal(translate_sql(100L %% 3L), sql("100 % 3"))
-})
-
-test_that("dplyr.strict_sql = TRUE prevents auto conversion", {
- old <- options(dplyr.strict_sql = TRUE)
- on.exit(options(old))
-
- expect_equal(translate_sql(1 + 2), sql("1.0 + 2.0"))
- expect_error(translate_sql(blah(x)), "could not find function")
-})
-
-test_that("Wrong number of arguments raises error", {
- expect_error(translate_sql(mean(1, 2), window = FALSE), "Invalid number of args")
-})
-
-test_that("Named arguments generates warning", {
- expect_warning(translate_sql(mean(x = 1), window = FALSE), "Named arguments ignored")
-})
-
-test_that("Subsetting always evaluated locally", {
- x <- list(a = 1, b = 1)
- y <- c(2, 1)
-
- correct <- quote(`_var` == 1)
-
- expect_equal(partial_eval(quote(`_var` == x$a)), correct)
- expect_equal(partial_eval(quote(`_var` == x[[2]])), correct)
- expect_equal(partial_eval(quote(`_var` == y[2])), correct)
-})
-
-test_that("between translated to special form (#503)", {
-
- out <- translate_sql(between(x, 1, 2))
- expect_equal(out, sql('"x" BETWEEN 1.0 AND 2.0'))
-})
-
-test_that("is.na and is.null are equivalent",{
- expect_equal(translate_sql(!is.na(x)), sql('NOT(("x") IS NULL)'))
- expect_equal(translate_sql(!is.null(x)), sql('NOT(("x") IS NULL)'))
-})
-
-test_that("if translation adds parens", {
- expect_equal(
- translate_sql(if (x) y),
- sql('CASE WHEN ("x") THEN ("y") END')
- )
- expect_equal(
- translate_sql(if (x) y else z),
- sql('CASE WHEN ("x") THEN ("y") ELSE ("z") END')
- )
-
-})
-
-test_that("pmin and pmax become min and max", {
- expect_equal(translate_sql(pmin(x, y)), sql('MIN("x", "y")'))
- expect_equal(translate_sql(pmax(x, y)), sql('MAX("x", "y")'))
-})
-
-# Minus -------------------------------------------------------------------
-
-test_that("unary minus flips sign of number", {
- expect_equal(translate_sql(-10L), sql("-10"))
- expect_equal(translate_sql(x == -10), sql('"x" = -10.0'))
- expect_equal(translate_sql(x %in% c(-1L, 0L)), sql('"x" IN (-1, 0)'))
-})
-
-test_that("unary minus wraps non-numeric expressions", {
- expect_equal(translate_sql(-(1L + 2L)), sql("-(1 + 2)"))
- expect_equal(translate_sql(-mean(x), window = FALSE), sql('-AVG("x")'))
-})
-
-test_that("binary minus subtracts", {
- expect_equal(translate_sql(1L - 10L), sql("1 - 10"))
-})
-
-# Window functions --------------------------------------------------------
-
-test_that("window functions without group have empty over", {
- expect_equal(translate_sql(n()), sql("COUNT(*) OVER ()"))
- expect_equal(translate_sql(sum(x)), sql('sum("x") OVER ()'))
-})
-
-test_that("aggregating window functions ignore order_by", {
- expect_equal(
- translate_sql(n(), vars_order = "x"),
- sql("COUNT(*) OVER ()")
- )
- expect_equal(
- translate_sql(sum(x), vars_order = "x"),
- sql('sum("x") OVER ()')
- )
-})
-
-test_that("cumulative windows warn if no order", {
- expect_warning(translate_sql(cumsum(x)), "does not have explicit order")
- expect_warning(translate_sql(cumsum(x), vars_order = "x"), NA)
-})
-
-test_that("ntile always casts to integer", {
- expect_equal(
- translate_sql(ntile(x, 10.5)),
- sql('NTILE(10) OVER (ORDER BY "x")')
- )
-})
-
-test_that("connection affects quoting character", {
- dbiTest <- structure(list(), class = "DBITestConnection")
- dbTest <- src_sql("test", con = dbiTest)
- testTable <- tbl_sql("test", src = dbTest, from = "table1")
-
- out <- select(testTable, field1)
- expect_match(sql_render(out), "^SELECT `field1` AS `field1`\nFROM `table1`$")
-})
diff --git a/tests/testthat/test-summarise.r b/tests/testthat/test-summarise.r
index 6d1222d..ceefba0 100644
--- a/tests/testthat/test-summarise.r
+++ b/tests/testthat/test-summarise.r
@@ -22,21 +22,16 @@ test_that("repeated outputs applied progressively (grouped_df)", {
})
-df <- data.frame(x = rep(1:4, each = 4), y = rep(1:2, each = 8), z = runif(16))
-tbls <- test_load(df)
-
test_that("summarise peels off a single layer of grouping", {
- for (i in seq_along(tbls)) {
- grouped <- group_by(tbls[[i]], x, y)
- summed <- summarise(grouped, n())
-
- expect_equal(unname(groups(summed)), list(quote(x)), info = names(tbls)[i])
- }
+ df <- tibble(x = rep(1:4, each = 4), y = rep(1:2, each = 8), z = runif(16))
+ grouped <- df %>% group_by(x, y, z)
+ expect_equal(group_vars(grouped), c("x", "y", "z"))
+ expect_equal(group_vars(grouped %>% summarise(n = n())), c("x", "y"))
})
test_that("summarise can refer to variables that were just created (#138)", {
- res <- summarise(tbl_df(mtcars), cyl1 = mean(cyl), cyl2 = cyl1 + 1 )
- expect_equal(res$cyl2, mean(mtcars$cyl)+1)
+ res <- summarise(tbl_df(mtcars), cyl1 = mean(cyl), cyl2 = cyl1 + 1)
+ expect_equal(res$cyl2, mean(mtcars$cyl) + 1)
gmtcars <- group_by(tbl_df(mtcars), am)
res <- summarise(gmtcars, cyl1 = mean(cyl), cyl2 = cyl1 + 1)
@@ -44,98 +39,159 @@ test_that("summarise can refer to variables that were just created (#138)", {
expect_equal(res$cyl2, res_direct$cyl2)
})
+test_that("summarise can refer to factor variables that were just created (#2217)", {
+ df <- data_frame(a = 1:3) %>%
+ group_by(a)
+ res <- df %>%
+ summarise(f = factor(if_else(a <= 1, "a", "b")), g = (f == "a"))
+ expect_equal(
+ res,
+ data_frame(a = 1:3, f = factor(c("a", "b", "b")), g = c(TRUE, FALSE, FALSE))
+ )
+})
+
test_that("summarise refuses to modify grouping variable (#143)", {
- df <- data.frame( a = c(1,2,1,2), b = c(1,1,2,2), x = 1:4 )
+ df <- data.frame(a = c(1, 2, 1, 2), b = c(1, 1, 2, 2), x = 1:4)
ds <- group_by(tbl_df(df), a, b)
expect_error(
summarise(ds, a = mean(x), a = b + 1),
- "cannot modify grouping variable"
+ "Column `a` can't be modified because it's a grouping variable"
)
})
test_that("summarise gives proper errors (#153)", {
- df <- data.frame(x=as.numeric(sample(1e3, 1e4, TRUE)),
- y=sample(1e4, 1e4, TRUE), z=runif(1e4))
- df <- tbl_df(df)
- df <- group_by(df, x, y)
- expect_error(summarise(df, diff(z)), "expecting a single value")
- expect_error(summarise(df, log(z)), "expecting a single value")
- expect_error(summarise(df, y[1:2]), "expecting a single value")
+ df <- data_frame(
+ x = 1,
+ y = c(1, 2, 2),
+ z = runif(3)
+ )
+ expect_error(
+ summarise(df, identity(NULL)),
+ "Column `identity(NULL)` is of unsupported type NULL",
+ fixed = TRUE
+ )
+ expect_error(
+ summarise(df, log(z)),
+ "Column `log(z)` must be length 1 (a summary value), not 3",
+ fixed = TRUE
+ )
+ expect_error(
+ summarise(df, y[1:2]),
+ "Column `y[1:2]` must be length 1 (a summary value), not 2",
+ fixed = TRUE
+ )
+ expect_error(
+ summarise(df, env(a = 1)),
+ "Column `env(a = 1)` is of unsupported type environment",
+ fixed = TRUE
+ )
+
+ gdf <- group_by(df, x, y)
+ expect_error(
+ summarise(gdf, identity(NULL)),
+ "Column `identity(NULL)` is of unsupported type NULL",
+ fixed = TRUE
+ )
+ expect_error(
+ summarise(gdf, z),
+ "Column `z` must be length 1 (a summary value), not 2",
+ fixed = TRUE
+ )
+ expect_error(
+ summarise(gdf, log(z)),
+ "Column `log(z)` must be length 1 (a summary value), not 2",
+ fixed = TRUE
+ )
+ expect_error(
+ summarise(gdf, y[1:2]),
+ "Column `y[1:2]` must be length 1 (a summary value), not 2",
+ fixed = TRUE
+ )
+ expect_error(
+ summarise(gdf, env(a = 1)),
+ "Column `env(a = 1)` is of unsupported type environment",
+ fixed = TRUE
+ )
})
test_that("summarise handles constants (#153)", {
- df <- data.frame(a=1:4)
+ df <- data.frame(a = 1:4)
today <- Sys.Date()
now <- Sys.time()
- res <- summarise(tbl_df(df), int = 1L, num = 1.0,
- str = "foo", bool = TRUE, date = today, time = now)
- expect_equal(res$int, 1L)
- expect_equal(res$num, 1.0)
- expect_equal(res$str, "foo")
+ res <- summarise(
+ tbl_df(df),
+ int = 1L, num = 1, str = "foo", bool = TRUE, date = today, time = now
+ )
+ expect_equal(res$int, 1L)
+ expect_equal(res$num, 1.0)
+ expect_equal(res$str, "foo")
expect_equal(res$bool, TRUE)
expect_equal(res$date, today)
expect_equal(res$time, now)
- res <- summarise(group_by(df,a), int = 1L, num = 1.0,
- str = "foo", bool = TRUE, date = today, time = now)
- expect_equal(res$int, rep(1L,4))
- expect_equal(res$num, rep(1.0,4))
- expect_equal(res$str, rep("foo",4))
- expect_equal(res$bool, rep(TRUE,4))
- expect_equal(res$date, rep(today,4))
- expect_equal(res$time, rep(now,4))
+ res <- summarise(
+ group_by(df, a),
+ int = 1L, num = 1, str = "foo", bool = TRUE, date = today, time = now
+ )
+ expect_equal(res$int, rep(1L, 4))
+ expect_equal(res$num, rep(1.0, 4))
+ expect_equal(res$str, rep("foo", 4))
+ expect_equal(res$bool, rep(TRUE, 4))
+ expect_equal(res$date, rep(today, 4))
+ expect_equal(res$time, rep(now, 4))
})
test_that("summarise handles passing ...", {
- df <- data.frame( x = 1:4 )
+ df <- data.frame(x = 1:4)
- f <- function(...){
+ f <- function(...) {
x1 <- 1
f1 <- function(x) x
- summarise(df, ..., x1 = f1(x1) )
+ summarise(df, ..., x1 = f1(x1))
}
- g <- function(...){
+ g <- function(...) {
x2 <- 2
f(x2 = x2, ...)
}
- h <- function(before = "before", ..., after = "after"){
- g(before = before, ..., after = after )
+ h <- function(before = "before", ..., after = "after") {
+ g(before = before, ..., after = after)
}
- res <- h( x3 = 3 )
- expect_equal(res$x1, 1 )
- expect_equal(res$x2, 2 )
+ res <- h(x3 = 3)
+ expect_equal(res$x1, 1)
+ expect_equal(res$x2, 2)
expect_equal(res$before, "before")
expect_equal(res$after, "after")
df <- tbl_df(df)
- res <- h( x3 = 3 )
- expect_equal(res$x1, 1 )
- expect_equal(res$x2, 2 )
+ res <- h(x3 = 3)
+ expect_equal(res$x1, 1)
+ expect_equal(res$x2, 2)
expect_equal(res$before, "before")
expect_equal(res$after, "after")
df <- group_by(df, x)
- res <- h( x3 = 3 )
- expect_equal(res$x1, rep(1, 4) )
- expect_equal(res$x2, rep(2, 4) )
- expect_equal(res$before, rep("before",4))
- expect_equal(res$after, rep("after",4))
+ res <- h(x3 = 3)
+ expect_equal(res$x1, rep(1, 4))
+ expect_equal(res$x2, rep(2, 4))
+ expect_equal(res$before, rep("before", 4))
+ expect_equal(res$after, rep("after", 4))
})
-test_that( "summarise propagate attributes (#194)", {
- df <- group_by(data.frame(
- b = rep(1:2,2),
+test_that("summarise propagate attributes (#194)", {
+ df <- data.frame(
+ b = rep(1:2, 2),
f = Sys.Date() + 1:4,
g = Sys.time() + 1:4,
stringsAsFactors = FALSE
- ), b)
+ ) %>%
+ group_by(b)
min_ <- min
- res <- summarise( df,
+ res <- summarise(df,
min_f = min(f),
max_f = max(f),
min_g = min(g),
@@ -144,161 +200,217 @@ test_that( "summarise propagate attributes (#194)", {
min__g = min_(g)
)
- expect_equal(class(res$min_f) , "Date" )
- expect_equal(class(res$max_f) , "Date" )
- expect_equal(class(res$min__f), "Date" )
+ expect_equal(class(res$min_f) , "Date")
+ expect_equal(class(res$max_f) , "Date")
+ expect_equal(class(res$min__f), "Date")
+
+ expect_equal(class(res$min_g) , c("POSIXct", "POSIXt"))
+ expect_equal(class(res$max_g) , c("POSIXct", "POSIXt"))
+ expect_equal(class(res$min__g), c("POSIXct", "POSIXt"))
+
+})
+
+test_that("summarise strips names, but only if grouped (#2231, #2675)", {
+ data <- data_frame(a = 1:3) %>% summarise(b = setNames(nm = a[[1]]))
+ expect_equal(names(data$b), "1")
- expect_equal(class(res$min_g) , c("POSIXct", "POSIXt" ))
- expect_equal(class(res$max_g) , c("POSIXct", "POSIXt" ))
- expect_equal(class(res$min__g), c("POSIXct", "POSIXt" ))
+ data <- data_frame(a = 1:3) %>% rowwise %>% summarise(b = setNames(nm = a))
+ expect_null(names(data$b))
+ data <- data_frame(a = c(1, 1, 2)) %>% group_by(a) %>% summarise(b = setNames(nm = a[[1]]))
+ expect_null(names(data$b))
})
test_that("summarise fails on missing variables", {
- expect_error(summarise(mtcars, a = mean(notthear)) )
+ # error messages from rlang
+ expect_error(summarise(mtcars, a = mean(notthear)))
})
-test_that("n() does not accept arguments",{
- expect_error(summarise(group_by(mtcars, cyl), n(hp)), "does not take arguments")
+test_that("summarise fails on missing variables when grouping (#2223)", {
+ # error messages from rlang
+ expect_error(summarise(group_by(mtcars, cyl), a = mean(notthear)))
+})
+
+test_that("n() does not accept arguments", {
+ expect_error(
+ summarise(group_by(mtcars, cyl), n(hp)),
+ "`n()` does not take arguments",
+ fixed = TRUE
+ )
})
test_that("hybrid nests correctly", {
- res <- group_by(mtcars, cyl) %>% summarise(a = if(n()>10) 1 else 2 )
- expect_equal(res$a, c(1,2,1))
+ res <- group_by(mtcars, cyl) %>%
+ summarise(a = if (n() > 10) 1 else 2)
+ expect_equal(res$a, c(1, 2, 1))
- res <- mtcars %>% summarise(a = if(n()>10) 1 else 2 )
+ res <- mtcars %>% summarise(a = if (n() > 10) 1 else 2)
expect_equal(res$a, 1)
})
test_that("hybrid min and max propagate attributes (#246)", {
- x <- data.frame(id=c(rep("a",2), rep("b",2)),
- date=as.POSIXct(c("2014-01-13", "2014-01-14",
- "2014-01-15", "2014-01-16"),
- tz="GMT"))
- y <- x %>% group_by(id) %>% summarise(max_date=max(date), min_date=min(date))
+ x <- data.frame(
+ id = c(rep("a", 2), rep("b", 2)),
+ date = as.POSIXct(c("2014-01-13", "2014-01-14", "2014-01-15", "2014-01-16"), tz = "GMT")
+ )
+ y <- x %>% group_by(id) %>% summarise(max_date = max(date), min_date = min(date))
expect_true("tzone" %in% names(attributes(y$min_date)))
expect_true("tzone" %in% names(attributes(y$max_date)))
})
test_that("summarise can use newly created variable more than once", {
- df <- data.frame(id=c(1,1,2,2,3,3), a=1:6) %>% group_by(id)
- for( i in 1:10){
- res <- summarise(df, biggest=max(a), smallest=min(a), diff1=biggest-smallest, diff2=smallest-biggest)
- expect_equal( res$diff1, -res$diff2)
+ df <- data.frame(id = c(1, 1, 2, 2, 3, 3), a = 1:6) %>% group_by(id)
+ for (i in 1:10) {
+ res <- summarise(
+ df,
+ biggest = max(a),
+ smallest = min(a),
+ diff1 = biggest - smallest,
+ diff2 = smallest - biggest
+ )
+ expect_equal(res$diff1, -res$diff2)
}
})
test_that("summarise creates an empty data frame when no parameters are used", {
res <- summarise(mtcars)
- expect_equal(res,data.frame())
+ expect_equal(res, data.frame())
})
-test_that("integer overflow (#304)",{
- groups <- rep(c('A', 'B'), each = 3)
- values <- rep(1e9, 6)
- dat <- data.frame(groups,
- X1 = as.integer(values),
- X2 = values)
+test_that("integer overflow (#304)", {
+ groups <- rep(c("A", "B"), each = 3)
+ values <- rep(1e9, 6)
+ dat <- data.frame(groups, X1 = as.integer(values), X2 = values)
# now group and summarise
expect_warning(
res <- group_by(dat, groups) %>%
summarise(sum_integer = sum(X1), sum_numeric = sum(X2)),
"integer overflow"
)
- expect_true( all(is.na(res$sum_integer)) )
- expect_equal( res$sum_numeric, rep(3e9, 2L) )
+ expect_true(all(is.na(res$sum_integer)))
+ expect_equal(res$sum_numeric, rep(3e9, 2L))
})
test_that("summarise checks outputs (#300)", {
- expect_error( summarise(mtcars, mpg, cyl) )
- expect_error( summarise(mtcars, mpg + cyl) )
+ expect_error(
+ summarise(mtcars, mpg, cyl),
+ "Column `mpg` must be length 1 (a summary value), not 32",
+ fixed = TRUE
+ )
+ expect_error(
+ summarise(mtcars, mpg + cyl),
+ "Column `mpg + cyl` must be length 1 (a summary value), not 32",
+ fixed = TRUE
+ )
})
-test_that("comment attribute is white listed (#346)",{
- test <- data.frame(A = c(1,1,0,0), B = c(2,2,3,3))
+test_that("comment attribute is white listed (#346)", {
+ test <- data.frame(A = c(1, 1, 0, 0), B = c(2, 2, 3, 3))
comment(test$B) <- "2nd Var"
res <- group_by(test, A)
- expect_equal(comment(res$B), "2nd Var" )
+ expect_equal(comment(res$B), "2nd Var")
})
-test_that("AsIs class is white listed (#453)",{
- test <- data.frame(A = c(1,1,0,0), B = I(c(2,2,3,3)))
+test_that("AsIs class is white listed (#453)", {
+ test <- data.frame(A = c(1, 1, 0, 0), B = I(c(2, 2, 3, 3)))
res <- group_by(test, B)
- expect_equal(res$B, test$B )
+ expect_equal(res$B, test$B)
})
test_that("names attribute is not retained (#357)", {
- df <- data.frame(x=c(1:3), y=letters[1:3])
+ df <- data.frame(x = c(1:3), y = letters[1:3])
df <- group_by(df, y)
m <- df %>% summarise(
- a=length(x),
- b=quantile(x, 0.5)
+ a = length(x),
+ b = quantile(x, 0.5)
)
- expect_equal(m$b, c(1,2,3))
+ expect_equal(m$b, c(1, 2, 3))
expect_null(names(m$b))
})
test_that("na.rm is supported (#168)", {
- df <- data.frame( x = c(1:5, NA, 7:10), y = rep(1:2, each = 5 ), z = c(rnorm(5), NA, rnorm(4) ) )
- res <- df %>% group_by(y) %>%
+ df <- data.frame(
+ x = c(1:5, NA, 7:10),
+ y = rep(1:2, each = 5),
+ z = c(rnorm(5), NA, rnorm(4))
+ )
+ res <- df %>%
+ group_by(y) %>%
summarise(
- mean_x = mean(x, na.rm = FALSE), mean_z = mean(z, na.rm = FALSE),
- min_x = min(x, na.rm = FALSE), min_z = min(z, na.rm = FALSE)
+ mean_x = mean(x, na.rm = FALSE),
+ mean_z = mean(z, na.rm = FALSE),
+ min_x = min(x, na.rm = FALSE),
+ min_z = min(z, na.rm = FALSE)
)
- expect_equal( res$mean_x[1], 3 )
- expect_true( is.na( res$mean_x[2] ) )
- expect_equal( res$mean_z[1], mean(df$z[1:5]) )
- expect_true( is.na(res$mean_z[2]) )
+ expect_equal(res$mean_x[1], 3)
+ expect_true(is.na(res$mean_x[2]))
+ expect_equal(res$mean_z[1], mean(df$z[1:5]))
+ expect_true(is.na(res$mean_z[2]))
- expect_equal( res$min_x[1], 1 )
- expect_true( is.na( res$min_x[2] ) )
- expect_equal( res$min_z[1], min(df$z[1:5]) )
- expect_true( is.na(res$min_z[2]) )
+ expect_equal(res$min_x[1], 1)
+ expect_true(is.na(res$min_x[2]))
+ expect_equal(res$min_z[1], min(df$z[1:5]))
+ expect_true(is.na(res$min_z[2]))
- res <- df %>% group_by(y) %>%
+ res <- df %>%
+ group_by(y) %>%
summarise(
- mean_x = mean(x, na.rm = TRUE), mean_z = mean(z, na.rm = TRUE),
- min_x = min(x, na.rm = TRUE), min_z = min(z, na.rm = TRUE)
- )
- expect_equal( res$mean_x[1], 3 )
- expect_equal( res$mean_x[2], 8.5 )
- expect_equal( res$mean_z[1], mean(df$z[1:5]) )
- expect_equal( res$mean_z[2], mean(df$z[7:10]) )
-
- expect_equal( res$min_x[1], 1 )
- expect_equal( res$min_x[2], 7 )
- expect_equal( res$min_z[1], min(df$z[1:5]) )
- expect_equal( res$min_z[2], min(df$z[7:10]) )
-
-})
-
-test_that( "summarise hybrid functions can use summarized variables", {
- df <- data.frame( x = c(1:5, NA, 7:10), y = rep(1:2, each = 5 ) ) %>% group_by(y)
- res <- summarise( df, x = mean(x), min = min(x), max = max(x), mean = mean(x), var = var(x) )
- expect_identical( res$x, res$min )
- expect_identical( res$x, res$max )
- expect_identical( res$x, res$mean )
- expect_identical( res$var, rep(NA_real_, 2) )
+ mean_x = mean(x, na.rm = TRUE),
+ mean_z = mean(z, na.rm = TRUE),
+ min_x = min(x, na.rm = TRUE),
+ min_z = min(z, na.rm = TRUE)
+ )
+ expect_equal(res$mean_x[1], 3)
+ expect_equal(res$mean_x[2], 8.5)
+ expect_equal(res$mean_z[1], mean(df$z[1:5]))
+ expect_equal(res$mean_z[2], mean(df$z[7:10]))
+
+ expect_equal(res$min_x[1], 1)
+ expect_equal(res$min_x[2], 7)
+ expect_equal(res$min_z[1], min(df$z[1:5]))
+ expect_equal(res$min_z[2], min(df$z[7:10]))
+
+})
+
+test_that("summarise hybrid functions can use summarized variables", {
+ df <- data.frame(x = c(1:5, NA, 7:10), y = rep(1:2, each = 5)) %>% group_by(y)
+ res <- summarise(
+ df,
+ x = mean(x),
+ min = min(x),
+ max = max(x),
+ mean = mean(x),
+ var = var(x)
+ )
+ expect_identical(res$x, res$min)
+ expect_identical(res$x, res$max)
+ expect_identical(res$x, res$mean)
+ expect_identical(res$var, rep(NA_real_, 2))
})
-test_that( "LazySubset is not confused about input data size (#452)", {
+test_that("LazySubset is not confused about input data size (#452)", {
res <- data.frame(a = c(10, 100)) %>% summarise(b = sum(a), c = sum(a) * 2)
expect_equal(res$b, 110)
expect_equal(res$c, 220)
})
-test_that( "nth, first, last promote dates and times (#509)", {
+test_that("nth, first, last promote dates and times (#509)", {
data <- data_frame(
- ID = rep(letters[1:4],each=5),
+ ID = rep(letters[1:4], each = 5),
date = Sys.Date() + 1:20,
time = Sys.time() + 1:20,
number = rnorm(20)
)
- res <- data %>% group_by(ID) %>% summarise(
- date2 = nth(date,2), time2 = nth(time,2),
- first_date = first(date), last_date = last(date),
- first_time = first(time), last_time = last(time)
+ res <- data %>%
+ group_by(ID) %>%
+ summarise(
+ date2 = nth(date, 2),
+ time2 = nth(time, 2),
+ first_date = first(date),
+ last_date = last(date),
+ first_time = first(time),
+ last_time = last(time)
)
expect_is(res$date2, "Date")
expect_is(res$first_date, "Date")
@@ -306,12 +418,19 @@ test_that( "nth, first, last promote dates and times (#509)", {
expect_is(res$time2, "POSIXct")
expect_is(res$first_time, "POSIXct")
expect_is(res$last_time, "POSIXct")
- expect_error(data %>% group_by(ID) %>% summarise(time2 = nth(times,2)) )
+ # error messages from rlang
+ expect_error(data %>% group_by(ID) %>% summarise(time2 = nth(times, 2)))
})
-test_that( "nth, first, last preserves factor data (#509)", {
- dat <- data_frame(a = rep(seq(1,20,2),3),b = as.ordered(a))
- dat1 <- dat %>% group_by(a) %>% summarise(der = nth(b,2), first = first(b), last = last(b) )
+test_that("nth, first, last preserves factor data (#509)", {
+ dat <- data_frame(a = rep(seq(1, 20, 2), 3), b = as.ordered(a))
+ dat1 <- dat %>%
+ group_by(a) %>%
+ summarise(
+ der = nth(b, 2),
+ first = first(b),
+ last = last(b)
+ )
expect_is(dat1$der, "ordered")
expect_is(dat1$first, "ordered")
expect_is(dat1$last, "ordered")
@@ -319,33 +438,46 @@ test_that( "nth, first, last preserves factor data (#509)", {
})
test_that("nth handle negative value (#1584) ", {
- df <- data.frame( a = 1:10, b = 10:1, g = rep(c(1,2), c(4,6)) ) %>% group_by(g)
-
- res <- summarise( df,
- x1 = nth(a,-1L),
- x2 = nth(a,-1L, order_by=b),
+ df <- data.frame(
+ a = 1:10, b = 10:1,
+ g = rep(c(1, 2), c(4, 6))
+ ) %>%
+ group_by(g)
+
+ res <- summarise(
+ df,
+ x1 = nth(a, -1L),
+ x2 = nth(a, -1L, order_by = b),
x3 = nth(a, -5L),
- x4 = nth(a, -5L, order_by=b),
+ x4 = nth(a, -5L, order_by = b),
x5 = nth(a, -5L, default = 99),
- x6 = nth(a, -5L, order_by=b, default = 99)
+ x6 = nth(a, -5L, order_by = b, default = 99)
)
- expect_equal( res$x1, c(4,10) )
- expect_equal( res$x2, c(1,5) )
- expect_true( is.na(res$x3[1]) )
- expect_equal( res$x3[2], 6 )
- expect_true( is.na(res$x4[1]) )
- expect_equal( res$x4[2], 9 )
- expect_equal( res$x5, c(99,6) )
- expect_equal( res$x6, c(99,9) )
+ expect_equal(res$x1, c(4, 10))
+ expect_equal(res$x2, c(1, 5))
+ expect_true(is.na(res$x3[1]))
+ expect_equal(res$x3[2], 6)
+ expect_true(is.na(res$x4[1]))
+ expect_equal(res$x4[2], 9)
+ expect_equal(res$x5, c(99, 6))
+ expect_equal(res$x6, c(99, 9))
})
-test_that( "LazyGroupSubsets is robust about columns not from the data (#600)", {
+test_that("LazyGroupSubsets is robust about columns not from the data (#600)", {
foo <- data_frame(x = 1:10, y = 1:10)
- expect_error( foo %>% group_by(x) %>% summarise(first_y = first(z)), "could not find variable" )
+ # error messages from rlang
+ expect_error(foo %>% group_by(x) %>% summarise(first_y = first(z)))
+})
+
+test_that("can summarise first(x[-1]) (#1980)", {
+ expect_equal(
+ tibble(x = 1:3) %>% summarise(f = first(x[-1])),
+ tibble(f = 2L)
+ )
})
-test_that( "hybrid eval handles $ and @ (#645)", {
+test_that("hybrid eval handles $ and @ (#645)", {
tmp <- expand.grid(a = 1:3, b = 0:1, i = 1:10)
g <- tmp %>% group_by(a)
@@ -358,50 +490,52 @@ test_that( "hybrid eval handles $ and @ (#645)", {
n = length(b),
p = f(r, n)$x[1]
)
- expect_equal(names(res), c("a", "r", "n", "p" ))
+ expect_equal(names(res), c("a", "r", "n", "p"))
res <- tmp %>% summarise(
r = sum(b),
n = length(b),
p = f(r, n)$x[1]
)
- expect_equal(names(res), c("r", "n", "p" ))
+ expect_equal(names(res), c("r", "n", "p"))
})
-test_that( "argument order_by in last is flexible enough to handle more than just a symbol (#626)", {
- res1 <- summarize(group_by(mtcars,cyl),
- big=last(mpg[drat>3],order_by=wt[drat>3]),
- small=first(mpg[drat>3],order_by=wt[drat>3]),
- second=nth(mpg[drat>3],2,order_by=wt[drat>3])
- )
+test_that("argument order_by in last is flexible enough to handle more than just a symbol (#626)", {
+ res1 <- group_by(mtcars, cyl) %>%
+ summarise(
+ big = last(mpg[drat > 3], order_by = wt[drat > 3]),
+ small = first(mpg[drat > 3], order_by = wt[drat > 3]),
+ second = nth(mpg[drat > 3], 2, order_by = wt[drat > 3])
+ )
# turning off lazy eval
last. <- last
first. <- first
nth. <- nth
- res2 <- summarize(group_by(mtcars,cyl),
- big=last.(mpg[drat>3],order_by=wt[drat>3]),
- small=first.(mpg[drat>3],order_by=wt[drat>3]),
- second=nth.(mpg[drat>3],2,order_by=wt[drat>3])
- )
+ res2 <- group_by(mtcars, cyl) %>%
+ summarise(
+ big = last.(mpg[drat > 3], order_by = wt[drat > 3]),
+ small = first.(mpg[drat > 3], order_by = wt[drat > 3]),
+ second = nth.(mpg[drat > 3], 2, order_by = wt[drat > 3])
+ )
expect_equal(res1, res2)
})
-test_that("min(., na.rm=TRUE) correctly handles Dates that are coded as REALSXP (#755)",{
+test_that("min(., na.rm=TRUE) correctly handles Dates that are coded as REALSXP (#755)", {
dates <- as.Date(c("2014-01-01", "2013-01-01"))
dd <- data.frame(Dates = dates)
- res <- summarise(dd, Dates = min(Dates, na.rm=TRUE))
- expect_is( res$Dates, "Date" )
- expect_equal( res$Dates, as.Date("2013-01-01"))
+ res <- summarise(dd, Dates = min(Dates, na.rm = TRUE))
+ expect_is(res$Dates, "Date")
+ expect_equal(res$Dates, as.Date("2013-01-01"))
})
test_that("nth handles expressions for n argument (#734)", {
df <- data.frame(x = c(1:4, 7:9, 13:19), y = sample(100:999, 14))
- idx <- which( df$x == 16 )
- res <- df %>% summarize(abc = nth(y, n = which(x == 16)) )
- expect_equal( res$abc, df$y[idx])
+ idx <- which(df$x == 16)
+ res <- df %>% summarize(abc = nth(y, n = which(x == 16)))
+ expect_equal(res$abc, df$y[idx])
})
test_that("summarise is not polluted by logical NA (#599)", {
@@ -410,77 +544,81 @@ test_that("summarise is not polluted by logical NA (#599)", {
res <- mean(x, na.rm = TRUE)
if (res > thresh) res else NA
}
- res <- dat %>% group_by(grp) %>% summarise( val = Mean(val, thresh = 2))
- expect_is( res$val, "numeric" )
- expect_true( is.na(res$val[1]) )
+ res <- dat %>% group_by(grp) %>% summarise(val = Mean(val, thresh = 2))
+ expect_is(res$val, "numeric")
+ expect_true(is.na(res$val[1]))
})
test_that("summarise handles list output columns (#832)", {
- df <- data_frame( x = 1:10, g = rep(1:2, each = 5) )
- res <- df %>% group_by(g) %>% summarise(y=list(x))
- expect_equal( res$y[[1]], 1:5)
- expect_equal( res$y[[2]], 6:10)
+ df <- data_frame(x = 1:10, g = rep(1:2, each = 5))
+ res <- df %>% group_by(g) %>% summarise(y = list(x))
+ expect_equal(res$y[[1]], 1:5)
+ expect_equal(res$y[[2]], 6:10)
# just checking objects are not messed up internally
- expect_equal( gp(res$y[[1]]), 0L )
- expect_equal( gp(res$y[[2]]), 0L )
+ expect_equal(gp(res$y[[1]]), 0L)
+ expect_equal(gp(res$y[[2]]), 0L)
- res <- df %>% group_by(g) %>% summarise(y=list(x+1))
- expect_equal( res$y[[1]], 1:5+1)
- expect_equal( res$y[[2]], 6:10+1)
+ res <- df %>% group_by(g) %>% summarise(y = list(x + 1))
+ expect_equal(res$y[[1]], 1:5 + 1)
+ expect_equal(res$y[[2]], 6:10 + 1)
# just checking objects are not messed up internally
- expect_equal( gp(res$y[[1]]), 0L )
- expect_equal( gp(res$y[[2]]), 0L )
+ expect_equal(gp(res$y[[1]]), 0L)
+ expect_equal(gp(res$y[[2]]), 0L)
- df <- data_frame( x = 1:10, g = rep(1:2, each = 5) )
- res <- df %>% summarise(y=list(x))
- expect_equal( res$y[[1]], 1:10 )
- res <- df %>% summarise(y=list(x+1))
- expect_equal( res$y[[1]], 1:10+1)
+ df <- data_frame(x = 1:10, g = rep(1:2, each = 5))
+ res <- df %>% summarise(y = list(x))
+ expect_equal(res$y[[1]], 1:10)
+ res <- df %>% summarise(y = list(x + 1))
+ expect_equal(res$y[[1]], 1:10 + 1)
})
test_that("summarise works with empty data frame (#1142)", {
df <- data.frame()
res <- df %>% summarise
- expect_equal( nrow(res), 0L )
- expect_equal( length(res), 0L )
+ expect_equal(nrow(res), 0L)
+ expect_equal(length(res), 0L)
})
test_that("n_distint uses na.rm argument", {
- df <- data.frame( x = c(1:3,NA), g = rep(1:2,2) )
- res <- summarise( df, n = n_distinct(x, na.rm = TRUE) )
- expect_equal( res$n, 3L )
+ df <- data.frame(x = c(1:3, NA), g = rep(1:2, 2))
+ res <- summarise(df, n = n_distinct(x, na.rm = TRUE))
+ expect_equal(res$n, 3L)
- res <- group_by(df, g) %>% summarise( n = n_distinct(x, na.rm = TRUE) )
- expect_equal( res$n, c(2L,1L) )
+ res <- group_by(df, g) %>% summarise(n = n_distinct(x, na.rm = TRUE))
+ expect_equal(res$n, c(2L, 1L))
})
test_that("n_distinct front end supports na.rm argument (#1052)", {
x <- c(1:3, NA)
- expect_equal( n_distinct(x, na.rm = TRUE), 3L )
+ expect_equal(n_distinct(x, na.rm = TRUE), 3L)
})
test_that("n_distinct without arguments stops (#1957)", {
- expect_error( n_distinct(), "at least one column for n_distinct" )
+ expect_error(
+ n_distinct(),
+ "Need at least one column for `n_distinct()`",
+ fixed = TRUE
+ )
})
test_that("hybrid evaluation does not take place for objects with a class (#1237)", {
mean.foo <- function(x) 42
- df <- data_frame( x = structure(1:10, class = "foo" ) )
- expect_equal( summarise(df, m = mean(x))$m[1], 42 )
+ df <- data_frame(x = structure(1:10, class = "foo"))
+ expect_equal(summarise(df, m = mean(x))$m[1], 42)
env <- environment()
- Foo <- suppressWarnings( setClass("Foo", contains = "numeric", where = env) )
- suppressMessages( setMethod( "mean", "Foo", function(x, ...) 42 , where = env) )
+ Foo <- suppressWarnings(setClass("Foo", contains = "numeric", where = env))
+ suppressMessages(setMethod("mean", "Foo", function(x, ...) 42, where = env))
on.exit(removeClass("Foo", where = env))
- df <- data.frame( x = Foo(c(1, 2, 3)) )
- expect_equal( summarise( df, m = mean(x) )$m[1], 42 )
+ df <- data.frame(x = Foo(c(1, 2, 3)))
+ expect_equal(summarise(df, m = mean(x))$m[1], 42)
})
test_that("summarise handles promotion of results (#893)", {
- df <- structure( list(
+ df <- structure(list(
price = c(580L, 650L, 630L, 706L, 1080L, 3082L, 3328L, 4229L, 1895L,
3546L, 752L, 13003L, 814L, 6115L, 645L, 3749L, 2926L, 765L,
1140L, 1158L),
@@ -488,7 +626,7 @@ test_that("summarise handles promotion of results (#893)", {
4L, 3L, 3L, 1L, 2L, 2L, 2L),
.Label = c("Good", "Ideal", "Premium", "Very Good"),
class = "factor")),
- row.names = c(NA,-20L),
+ row.names = c(NA, -20L),
.Names = c("price", "cut"),
class = "data.frame"
)
@@ -496,25 +634,27 @@ test_that("summarise handles promotion of results (#893)", {
group_by(cut) %>%
select(price) %>%
summarise(price = median(price))
- expect_is( res$price, "numeric" )
+ expect_is(res$price, "numeric")
})
-test_that("summarise correctly handles logical (#1291)",{
+test_that("summarise correctly handles logical (#1291)", {
test <- expand.grid(id = 1:2, type = letters[1:2], sample = 1:2) %>%
- mutate(var = c(1, 0, 1, 1, 0, 0, 0, 1)) %>%
- mutate(var_l = as.logical(var)) %>%
- mutate(var_ch = as.character(var_l)) %>%
- arrange(id, type, sample) %>%
- group_by(id, type)
+ mutate(var = c(1, 0, 1, 1, 0, 0, 0, 1)) %>%
+ mutate(var_l = as.logical(var)) %>%
+ mutate(var_ch = as.character(var_l)) %>%
+ arrange(id, type, sample) %>%
+ group_by(id, type)
test_sum <- test %>%
- ungroup() %>%
- group_by(id, type) %>%
- summarise(anyvar = any(var == 1),
- anyvar_l = any(var_l),
- anyvar_ch = any(var_ch == "TRUE"))
+ ungroup() %>%
+ group_by(id, type) %>%
+ summarise(
+ anyvar = any(var == 1),
+ anyvar_l = any(var_l),
+ anyvar_ch = any(var_ch == "TRUE")
+ )
- expect_equal( test_sum$anyvar, c(TRUE,TRUE,FALSE,TRUE) )
+ expect_equal(test_sum$anyvar, c(TRUE, TRUE, FALSE, TRUE))
})
@@ -526,55 +666,60 @@ test_that("summarise correctly handles NA groups (#1261)", {
)
res <- tmp %>% group_by(a, b1) %>% summarise(n())
- expect_equal( nrow(res), 2L)
+ expect_equal(nrow(res), 2L)
res <- tmp %>% group_by(a, b2) %>% summarise(n())
- expect_equal( nrow(res), 2L)
+ expect_equal(nrow(res), 2L)
})
test_that("n_distinct handles multiple columns (#1084)", {
- df <- data.frame( x = rep(1:4, each = 2), y = rep(1:2, each = 4), g = rep(1:2, 4))
- res <- summarise( df, n = n_distinct(x,y) )
- expect_equal( res$n, 4L)
+ df <- data.frame(
+ x = rep(1:4, each = 2),
+ y = rep(1:2, each = 4),
+ g = rep(1:2, 4)
+ )
+ res <- summarise(df, n = n_distinct(x, y))
+ expect_equal(res$n, 4L)
- res <- group_by(df, g) %>% summarise( n = n_distinct(x,y) )
- expect_equal( res$n, c(4L,4L) )
+ res <- group_by(df, g) %>% summarise(n = n_distinct(x, y))
+ expect_equal(res$n, c(4L, 4L))
df$x[3] <- df$y[7] <- NA
- res <- summarise( df, n = n_distinct(x,y) )
- expect_equal( res$n, 6L)
- res <- summarise( df, n = n_distinct(x,y, na.rm=TRUE) )
- expect_equal( res$n, 4L)
+ res <- summarise(df, n = n_distinct(x, y))
+ expect_equal(res$n, 6L)
+ res <- summarise(df, n = n_distinct(x, y, na.rm = TRUE))
+ expect_equal(res$n, 4L)
- res <- group_by(df, g) %>% summarise( n = n_distinct(x, y) )
- expect_equal( res$n, c(4L,4L) )
+ res <- group_by(df, g) %>% summarise(n = n_distinct(x, y))
+ expect_equal(res$n, c(4L, 4L))
- res <- group_by(df, g) %>% summarise( n = n_distinct(x, y, na.rm = TRUE) )
- expect_equal( res$n, c(2L,4L) )
-})
-
-test_that("n_distinct stops if no columns are passed (#1957)", {
- df <- data.frame( x = rep(1:4, each = 2), y = rep(1:2, each = 4), g = rep(1:2, 4))
- expect_error(summarise( df, nd = n_distinct(), n = n()), "at least one column for n_distinct" )
+ res <- group_by(df, g) %>% summarise(n = n_distinct(x, y, na.rm = TRUE))
+ expect_equal(res$n, c(2L, 4L))
})
test_that("hybrid max works when not used on columns (#1369)", {
df <- data_frame(x = 1:1000)
y <- 1:10
- expect_equal( summarise(df, z = max(y))$z, 10 )
- expect_equal( summarise(df, z = max(10))$z, 10 )
+ expect_equal(summarise(df, z = max(y))$z, 10)
+ expect_equal(summarise(df, z = max(10))$z, 10)
})
-test_that( "min and max handle empty sets in summarise (#1481)", {
- df <- data_frame(A=numeric())
- res <- df %>% summarise(Min=min(A, na.rm=T), Max = max(A, na.rm=TRUE))
- expect_equal( res$Min, Inf )
- expect_equal( res$Max, -Inf )
+test_that("min and max handle empty sets in summarise (#1481)", {
+ df <- data_frame(A = numeric())
+ res <- df %>% summarise(Min = min(A, na.rm = TRUE), Max = max(A, na.rm = TRUE))
+ expect_equal(res$Min, Inf)
+ expect_equal(res$Max, -Inf)
})
test_that("lead and lag behave correctly in summarise (#1434)", {
res <- mtcars %>%
group_by(cyl) %>%
- summarise(n = n(), leadn = lead(n), lagn=lag(n), leadn10=lead(n, default=10), lagn10 = lag(n, default = 10))
+ summarise(
+ n = n(),
+ leadn = lead(n),
+ lagn = lag(n),
+ leadn10 = lead(n, default = 10),
+ lagn10 = lag(n, default = 10)
+ )
expect_true(all(is.na(res$lagn)))
expect_true(all(is.na(res$leadn)))
expect_true(all(res$lagn10 == 10))
@@ -582,7 +727,13 @@ test_that("lead and lag behave correctly in summarise (#1434)", {
res <- mtcars %>%
rowwise() %>%
- summarise(n = n(), leadn = lead(n), lagn=lag(n), leadn10=lead(n, default=10), lagn10 = lag(n, default = 10))
+ summarise(
+ n = n(),
+ leadn = lead(n),
+ lagn = lag(n),
+ leadn10 = lead(n, default = 10),
+ lagn10 = lag(n, default = 10)
+ )
expect_true(all(is.na(res$lagn)))
expect_true(all(is.na(res$leadn)))
expect_true(all(res$lagn10 == 10))
@@ -590,45 +741,43 @@ test_that("lead and lag behave correctly in summarise (#1434)", {
})
-test_that("summarise understands column. #1012", {
- ir1 <- summarise( iris, Sepal = sum(Sepal.Length * Sepal.Width) )
- ir2 <- summarise( iris, Sepal = sum(column("Sepal.Length") * column("Sepal.Width")) )
- expect_equal(ir1, ir2)
-
- ir1 <- summarise( group_by(iris, Species), Sepal = sum(Sepal.Length * Sepal.Width) )
- ir2 <- summarise( group_by(iris, Species), Sepal = sum(column("Sepal.Length") * column("Sepal.Width")) )
- expect_equal(ir1, ir2)
-})
+# .data and .env tests now in test-hybrid-traverse.R
-test_that("data.frame columns are supported in summarise (#1425)" , {
+test_that("data.frame columns are supported in summarise (#1425)", {
df <- data.frame(x1 = rep(1:3, times = 3), x2 = 1:9)
df$x3 <- df %>% mutate(x3 = x2)
res <- df %>% group_by(x1) %>% summarise(nr = nrow(x3))
- expect_true(all(res$nr==3))
+ expect_true(all(res$nr == 3))
})
test_that("summarise handles min/max of already summarised variable (#1622)", {
df <- data.frame(
- FIRST_DAY=rep(seq(as.POSIXct("2015-12-01", tz="UTC"), length.out=2, by="days"),2),
- event=c("a","a","b","b")
+ FIRST_DAY = rep(seq(as.POSIXct("2015-12-01", tz = "UTC"), length.out = 2, by = "days"), 2),
+ event = c("a", "a", "b", "b")
)
- df_summary <- df %>% group_by(event) %>% summarise(FIRST_DAY=min(FIRST_DAY), LAST_DAY=max(FIRST_DAY))
+ df_summary <- df %>%
+ group_by(event) %>%
+ summarise(FIRST_DAY = min(FIRST_DAY), LAST_DAY = max(FIRST_DAY))
expect_equal(df_summary$FIRST_DAY, df_summary$LAST_DAY)
})
test_that("group_by keeps classes (#1631)", {
- df <- data.frame(a=1, b=as.Date(NA)) %>% group_by(a) %>% summarize(c=min(b))
- expect_equal( class(df$c), "Date")
+ df <- data.frame(a = 1, b = as.Date(NA)) %>%
+ group_by(a) %>%
+ summarize(c = min(b))
+ expect_equal(class(df$c), "Date")
- df <- data.frame(a=1, b=as.POSIXct(NA)) %>% group_by(a) %>% summarize(c=min(b))
- expect_equal( class(df$c), c( "POSIXct", "POSIXt") )
+ df <- data.frame(a = 1, b = as.POSIXct(NA)) %>%
+ group_by(a) %>%
+ summarize(c = min(b))
+ expect_equal(class(df$c), c("POSIXct", "POSIXt"))
})
test_that("hybrid n_distinct falls back to R evaluation when needed (#1657)", {
- dat3 <- data.frame(id = c(2,6,7,10,10))
- res <- dat3 %>% summarise(n_unique = n_distinct(id[id>6]))
+ dat3 <- data.frame(id = c(2, 6, 7, 10, 10))
+ res <- dat3 %>% summarise(n_unique = n_distinct(id[id > 6]))
expect_equal(res$n_unique, 2)
})
@@ -636,12 +785,12 @@ test_that("summarise() correctly coerces factors with different levels (#1678)",
res <- data_frame(x = 1:3) %>%
group_by(x) %>%
summarise(
- y = if(x == 1) "a" else "b",
+ y = if (x == 1) "a" else "b",
z = factor(y)
)
- expect_is( res$z, "factor")
- expect_equal( levels(res$z), c("a", "b") )
- expect_equal( as.character(res$z), c("a", "b", "b") )
+ expect_is(res$z, "factor")
+ expect_equal(levels(res$z), c("a", "b"))
+ expect_equal(as.character(res$z), c("a", "b", "b"))
})
test_that("summarise works if raw columns exist but are not involved (#1803)", {
@@ -651,5 +800,190 @@ test_that("summarise works if raw columns exist but are not involved (#1803)", {
test_that("summarise fails gracefully on raw columns (#1803)", {
df <- data_frame(a = 1:3, b = as.raw(1:3))
- expect_error( summarise(df, c = b[[1]]), 'Unsupported type RAWSXP for column "c"' )
+ expect_error(
+ summarise(df, c = b[[1]]),
+ "Column `c` is of unsupported type raw vector",
+ fixed = TRUE
+ )
+})
+
+test_that("dim attribute is stripped from grouped summarise (#1918)", {
+ df <- data.frame(a = 1:3, b = 1:3)
+
+ df_regular <- summarise(df, b = scale(b)[1, 1])
+ df_grouped <- summarise(group_by(df, a), b = scale(b))
+ df_rowwise <- summarise(rowwise(df), b = scale(b))
+
+ expect_null(dim(df$b))
+ expect_null(dim(df_grouped$b))
+ expect_null(dim(df_rowwise$b))
+})
+
+test_that("typing and NAs for grouped summarise (#1839)", {
+ expect_identical(
+ data_frame(id = 1L, a = NA_character_) %>%
+ group_by(id) %>%
+ summarise(a = a[[1]]) %>%
+ .$a,
+ NA_character_)
+
+ expect_identical(
+ data_frame(id = 1:2, a = c(NA, "a")) %>%
+ group_by(id) %>%
+ summarise(a = a[[1]]) %>%
+ .$a,
+ c(NA, "a"))
+
+ # Properly upgrade NA (logical) to character
+ expect_identical(
+ data_frame(id = 1:2, a = 1:2) %>%
+ group_by(id) %>%
+ summarise(a = ifelse(all(a < 2), NA, "yes")) %>%
+ .$a,
+ c(NA, "yes"))
+
+ expect_error(
+ data_frame(id = 1:2, a = list(1, "2")) %>%
+ group_by(id) %>%
+ summarise(a = a[[1]]) %>%
+ .$a,
+ "Column `a` can't promote group 1 to numeric",
+ fixed = TRUE
+ )
+
+ expect_identical(
+ data_frame(id = 1:2, a = list(1, "2")) %>%
+ group_by(id) %>%
+ summarise(a = a[1]) %>%
+ .$a,
+ list(1, "2"))
+})
+
+test_that("typing and NAs for rowwise summarise (#1839)", {
+ expect_identical(
+ data_frame(id = 1L, a = NA_character_) %>%
+ rowwise %>%
+ summarise(a = a[[1]]) %>%
+ .$a,
+ NA_character_)
+
+ expect_identical(
+ data_frame(id = 1:2, a = c(NA, "a")) %>%
+ rowwise %>%
+ summarise(a = a[[1]]) %>%
+ .$a,
+ c(NA, "a"))
+
+ # Properly promote NA (logical) to character
+ expect_identical(
+ data_frame(id = 1:2, a = 1:2) %>%
+ group_by(id) %>%
+ summarise(a = ifelse(all(a < 2), NA, "yes")) %>%
+ .$a,
+ c(NA, "yes"))
+
+ expect_error(
+ data_frame(id = 1:2, a = list(1, "2")) %>%
+ rowwise %>%
+ summarise(a = a[[1]]) %>%
+ .$a,
+ "Column `a` can't promote group 1 to numeric",
+ fixed = TRUE
+ )
+
+ expect_error(
+ data_frame(id = 1:2, a = list(1, "2")) %>%
+ rowwise %>%
+ summarise(a = a[1]) %>%
+ .$a,
+ "Column `a` can't promote group 1 to numeric",
+ fixed = TRUE
+ )
+})
+
+test_that("calculating an ordered factor preserves order (#2200)", {
+ test_df <- tibble(
+ id = c("a", "b"),
+ val = 1:2
+ )
+
+ ret <- group_by(test_df, id) %>%
+ summarize(level = ordered(val))
+
+ expect_s3_class(ret$level, "ordered")
+ expect_equal(levels(ret$level), c("1", "2"))
+})
+
+test_that("min, max preserves ordered factor data (#2200)", {
+ test_df <- tibble(
+ id = rep(c("a", "b"), 2),
+ ord = ordered(c("A", "B", "B", "A"), levels = c("A", "B"))
+ )
+
+ ret <- group_by(test_df, id) %>%
+ summarize(
+ min_ord = min(ord),
+ max_ord = max(ord)
+ )
+
+ expect_s3_class(ret$min_ord, "ordered")
+ expect_s3_class(ret$max_ord, "ordered")
+ expect_equal(levels(ret$min_ord), levels(test_df$ord))
+ expect_equal(levels(ret$max_ord), levels(test_df$ord))
+})
+
+test_that("ungrouped summarise() uses summary variables correctly (#2404)", {
+ df <- tibble::as_tibble(seq(1:10))
+
+ out <- df %>% summarise(value = mean(value), sd = sd(value))
+ expect_equal(out$value, 5.5)
+ expect_equal(out$sd, NA_real_)
+})
+
+test_that("proper handling of names in summarised list columns (#2231)", {
+ d <- data_frame(x = rep(1:3, 1:3), y = 1:6, names = letters[1:6])
+ res <- d %>% group_by(x) %>% summarise(y = list(setNames(y, names)))
+ expect_equal(names(res$y[[1]]), letters[[1]])
+ expect_equal(names(res$y[[2]]), letters[2:3])
+ expect_equal(names(res$y[[3]]), letters[4:6])
+})
+
+test_that("proper handling of NA factors (#2588)", {
+ df <- tibble(
+ x = c(1, 1, 2, 2, 3, 3),
+ y = factor(c(NA, NA, NA, "2", "3", "3"))
+ )
+
+ ret <- df %>% group_by(x) %>% summarise(y = y[1])
+ expect_identical(as.character(ret$y), c(NA, NA, "3"))
+})
+
+test_that("can refer to previously summarised symbols", {
+ expect_identical(summarise(group_by(mtcars, cyl), x = 1, z = x)[2:3], tibble(x = c(1, 1, 1), z = x))
+ expect_identical(summarise(group_by(mtcars, cyl), x = n(), z = x)[2:3], tibble(x = c(11L, 7L, 14L), z = x))
+})
+
+test_that("can refer to symbols if group size is one overall", {
+ df <- tibble(x = LETTERS[3:1], y = 1:3)
+ expect_identical(
+ df %>%
+ group_by(x) %>%
+ summarise(z = y),
+ tibble(x = LETTERS[1:3], z = 3:1)
+ )
+})
+
+test_that("summarise() supports unquoted values", {
+ df <- tibble(g = c(1, 1, 2, 2, 2), x = 1:5)
+ expect_identical(summarise(df, out = !! 1), tibble(out = 1))
+ expect_identical(summarise(df, out = !! quote(identity(1))), tibble(out = 1))
+ expect_error(summarise(df, out = !! 1:2), "must be length 1 (the number of groups)", fixed = TRUE)
+ expect_error(summarise(df, out = !! env(a = 1)), "unsupported type")
+
+ gdf <- group_by(df, g)
+ expect_identical(summarise(gdf, out = !! 1), summarise(gdf, out = 1))
+ expect_identical(summarise(gdf, out = !! 1:2), tibble(g = c(1, 2), out = 1:2))
+ expect_identical(summarise(gdf, out = !! quote(identity(1))), summarise(gdf, out = 1))
+ expect_error(summarise(gdf, out = !! 1:5), "must be length 2 (the number of groups)", fixed = TRUE)
+ expect_error(summarise(gdf, out = !! env(a = 1)), "unsupported type")
})
diff --git a/tests/testthat/test-tally.R b/tests/testthat/test-tally.R
deleted file mode 100644
index ad6f35d..0000000
--- a/tests/testthat/test-tally.R
+++ /dev/null
@@ -1,7 +0,0 @@
-context("tally")
-
-test_that("weighted tally drops NAs (#1145)", {
- df <- data_frame(x = c(1, 1, NA))
-
- expect_equal(tally(df, x)$n, 2)
-})
diff --git a/tests/testthat/test-tbl-cube.R b/tests/testthat/test-tbl-cube.R
index 7dd05e3..bba7151 100644
--- a/tests/testthat/test-tbl-cube.R
+++ b/tests/testthat/test-tbl-cube.R
@@ -1,11 +1,37 @@
context("tbl_cube")
+test_that("construction errors", {
+ expect_error(
+ tbl_cube(1:3, 1:3),
+ "`dimensions` must be a named list of vectors, not integer",
+ fixed = TRUE
+ )
+
+ expect_error(
+ tbl_cube(list(a = 1:3), 1:3),
+ "`measures` must be a named list of arrays, not integer",
+ fixed = TRUE
+ )
+
+ expect_error(
+ tbl_cube(list(a = 1:3), list(b = 1:3)),
+ "`measures` must be a named list of arrays, not list",
+ fixed = TRUE
+ )
+
+ expect_error(
+ tbl_cube(list(a = 1:3), list(b = array(1:3), c = array(1:2))),
+ "Measure `c` needs dimensions [3], not [2]",
+ fixed = TRUE
+ )
+})
+
test_that("coercion", {
- grid <- expand.grid(x=letters[1:3], y=letters[1:5], stringsAsFactors = FALSE)
- tbl <- table(x=grid$x, y=grid$y)
+ grid <- expand.grid(x = letters[1:3], y = letters[1:5], stringsAsFactors = FALSE)
+ tbl <- table(x = grid$x, y = grid$y)
tbl_as_df <- as.data.frame(tbl, stringsAsFactors = FALSE)
expect_message(cube <- as.tbl_cube(tbl_as_df), "Using Freq as")
- expect_identical(cube$dims, list(x=letters[1:3], y=letters[1:5]))
+ expect_identical(cube$dims, list(x = letters[1:3], y = letters[1:5]))
expect_identical(names(cube$mets), "Freq")
expect_message(cube_met <- as.tbl_cube(tbl_as_df, met_name = "Freq"), NA)
@@ -19,10 +45,11 @@ test_that("coercion", {
})
test_that("incomplete", {
- d <- rbind(cbind(data_frame(s=1), expand.grid(j=1)),
- cbind(data_frame(s=2), expand.grid(j=1:2)))
+ d <- rbind(
+ cbind(data.frame(s = 1), expand.grid(j = 1)),
+ cbind(data.frame(s = 2), expand.grid(j = 1:2))
+ )
d$value <- 1:3
- d <- as_data_frame(d)
cube <- as.tbl_cube(d, met_name = "value")
expect_true(is.na(as.data.frame(filter(cube, s == 1, j == 2))[["value"]]))
@@ -30,11 +57,39 @@ test_that("incomplete", {
})
test_that("duplicate", {
- d <- rbind(cbind(data_frame(s=1), expand.grid(j=c(1, 1))),
- cbind(data_frame(s=2), expand.grid(j=1:2)))
+ d <- rbind(
+ cbind(data.frame(s = 1), expand.grid(j = c(1, 1))),
+ cbind(data.frame(s = 2), expand.grid(j = 1:2))
+ )
d$value <- 1:4
- expect_error(as.tbl_cube(d, met_name = "value"), "Duplicate.*s = 1, j = 1")
+ expect_error(
+ as.tbl_cube(d, met_name = "value"),
+ "`x` must be unique in all combinations of dimension variables, duplicates: `s` = 1, `j` = 1",
+ fixed = TRUE
+ )
+})
+
+test_that("filter", {
+ expect_equal(
+ nasa %>% filter(month == 1) %>% filter(year == 2000),
+ nasa %>% filter(year == 2000) %>% filter(month == 1)
+ )
+
+ expect_equal(
+ nasa %>% filter(month == 1) %>% filter(year == 2000),
+ filter(nasa, month == 1, year == 2000)
+ )
+
+ expect_equal(
+ filter(nasa, month == 1, year == 2000),
+ filter(nasa, year == 2000, month == 1)
+ )
+
+ expect_error(
+ filter(nasa, month == 1 & year == 2000),
+ "`month == 1 & year == 2000` must refer to exactly one dimension, not `month`, `year`"
+ )
})
test_that("summarise works with single group", {
@@ -50,8 +105,10 @@ test_that("summarise works with single group", {
test_that("can coerce to data_frame", {
slice <- filter(nasa, year == 1995L, month == 1L)
- expect_identical(tbl_df(as.data.frame(slice, stringsAsFactors = FALSE)),
- as_data_frame(slice))
+ expect_identical(
+ tbl_df(as.data.frame(slice, stringsAsFactors = FALSE)),
+ as_data_frame(slice)
+ )
})
test_that("can coerce to table", {
@@ -61,3 +118,8 @@ test_that("can coerce to table", {
expect_equal(as.vector(as.table(nasa)), as.vector(nasa$mets[[1]]))
expect_identical(as.table(nasa, measure = "ozone"), as.table(select(nasa, ozone)))
})
+
+test_that("group_vars() returns variables", {
+ gcube <- group_by(nasa, month)
+ expect_identical(group_vars(gcube), "month")
+})
diff --git a/tests/testthat/test-tbl-sql.r b/tests/testthat/test-tbl-sql.r
deleted file mode 100644
index 5f81e01..0000000
--- a/tests/testthat/test-tbl-sql.r
+++ /dev/null
@@ -1,11 +0,0 @@
-context("tbl_sql")
-
-test_that("can generate sql tbls with raw sql", {
- df <- data_frame(x = 1:3, y = 3:1)
- tbls <- test_load(df, ignore = "df")
-
- test_f <- function(tbl) {
- clone <- tbl(x$src, build_sql("SELECT * FROM ", x$from))
- expect_equal(collect(tbl), collect(clone))
- }
-})
diff --git a/tests/testthat/test-tbl.R b/tests/testthat/test-tbl.R
new file mode 100644
index 0000000..9897c1d
--- /dev/null
+++ b/tests/testthat/test-tbl.R
@@ -0,0 +1,9 @@
+context("tbl")
+
+test_that("tbl_nongroup_vars() excludes group variables", {
+ cube <- group_by(nasa, month)
+ expect_identical(tbl_nongroup_vars(cube), setdiff(tbl_vars(cube), "month"))
+
+ gdf <- group_by(mtcars, cyl)
+ expect_identical(tbl_nongroup_vars(gdf), setdiff(tbl_vars(gdf), "cyl"))
+})
diff --git a/tests/testthat/test-top-n.R b/tests/testthat/test-top-n.R
index f1e7db2..e7450df 100644
--- a/tests/testthat/test-top-n.R
+++ b/tests/testthat/test-top-n.R
@@ -5,3 +5,14 @@ test_that("top_n returns n rows", {
top_four <- test_df %>% top_n(4, y)
expect_equal(dim(top_four), c(4, 2))
})
+
+test_that("top_n() handles missing `wt`", {
+ df <- data.frame(x = c(10, 4, 1, 6, 3, 1, 1))
+ expect_message(regexp = "Selecting by x",
+ expect_identical(top_n(df, 2)$x, c(10, 6))
+ )
+})
+
+test_that("top_n() handles calls", {
+ expect_identical(top_n(mtcars, 2, -disp), top_n(mtcars, -2, disp))
+})
diff --git a/tests/testthat/test-ts.R b/tests/testthat/test-ts.R
new file mode 100644
index 0000000..cf401e1
--- /dev/null
+++ b/tests/testthat/test-ts.R
@@ -0,0 +1,16 @@
+context("ts")
+
+test_that("filter and lag throw errors", {
+ x <- ts(1:10)
+
+ expect_error(
+ filter(x),
+ "`.data` must be a data source, not a ts object, do you want `stats::filter()`?",
+ fixed = TRUE
+ )
+ expect_error(
+ lag(x),
+ "`x` must be a vector, not a ts object, do you want `stats::lag()`?",
+ fixed = TRUE
+ )
+})
diff --git a/tests/testthat/test-underscore.R b/tests/testthat/test-underscore.R
new file mode 100644
index 0000000..f083d26
--- /dev/null
+++ b/tests/testthat/test-underscore.R
@@ -0,0 +1,412 @@
+context("underscore")
+
+df <- data_frame(
+ a = c(1:3, 2:3),
+ b = letters[c(1:4, 4L)]
+)
+
+test_that("arrange_ works", {
+ expect_equal(
+ arrange_(df, ~-a),
+ arrange(df, -a)
+ )
+
+ expect_equal(
+ arrange_(df, .dots = list(quote(-a))),
+ arrange(df, -a)
+ )
+
+ expect_equal(
+ arrange_(df, .dots = list(~-a)),
+ arrange(df, -a)
+ )
+})
+
+test_that("count_ works", {
+ expect_equal(
+ count_(df, ~b),
+ count(df, b)
+ )
+
+ expect_equal(
+ count_(df, ~b, wt = quote(a)),
+ count(df, b, wt = a)
+ )
+
+ wt <- 1:4
+ expect_identical(
+ count_(df, "b", "wt"),
+ count(df, b, wt = wt)
+ )
+
+ expect_identical(
+ add_count(df, b),
+ add_count_(df, ~b)
+ )
+})
+
+test_that("distinct_ works", {
+ expect_equal(
+ distinct_(df, ~a),
+ distinct(df, a)
+ )
+
+ expect_equal(
+ distinct_(df, .dots = list(quote(a))),
+ distinct(df, a)
+ )
+
+ expect_equal(
+ distinct_(df, .dots = list(~a)),
+ distinct(df, a)
+ )
+
+ expect_equal(
+ distinct_(df %>% group_by(b), ~a, .dots = NULL),
+ distinct(df %>% group_by(b), a)
+ )
+
+ expect_equal(
+ distinct_(df %>% group_by(b), .dots = list(quote(a))),
+ distinct(df %>% group_by(b), a)
+ )
+
+ expect_equal(
+ distinct_(df %>% group_by(b), .dots = list(~a)),
+ distinct(df %>% group_by(b), a)
+ )
+})
+
+test_that("do_ works", {
+ expect_equal(
+ do_(df, ~data_frame(-.$a)),
+ do(df, data_frame(-.$a))
+ )
+
+ expect_equal(
+ do_(df, .dots = list(quote(dplyr::data_frame(-.$a)))),
+ do(df, data_frame(-.$a))
+ )
+
+ expect_equal(
+ do_(df, .dots = list(~dplyr::data_frame(-.$a))),
+ do(df, data_frame(-.$a))
+ )
+
+ foo <- "foobar"
+ expect_identical(
+ do_(df, .dots = "tibble(foo)"),
+ do(df, tibble(foo))
+ )
+
+ expect_equal(
+ do_(df %>% group_by(b), ~data_frame(-.$a)),
+ do(df %>% group_by(b), data_frame(-.$a))
+ )
+
+ expect_equal(
+ do_(df %>% group_by(b), .dots = list(quote(dplyr::data_frame(-.$a)))),
+ do(df %>% group_by(b), data_frame(-.$a))
+ )
+
+ expect_equal(
+ do_(df %>% group_by(b), .dots = list(~dplyr::data_frame(-.$a))),
+ do(df %>% group_by(b), data_frame(-.$a))
+ )
+})
+
+test_that("filter_ works", {
+ expect_equal(
+ filter_(df, ~a > 1),
+ filter(df, a > 1)
+ )
+
+ expect_equal(
+ filter_(df, .dots = list(quote(a > 1))),
+ filter(df, a > 1)
+ )
+
+ cnd <- rep(TRUE, 5)
+ expect_identical(
+ filter_(df, .dots = "cnd"),
+ filter(df, cnd)
+ )
+})
+
+test_that("funs_ works", {
+ expect_equal(
+ funs(mean),
+ funs_(list(~mean))
+ )
+
+ expect_equal(
+ funs_(list("mean")),
+ funs_(list(`environment<-`(~mean, baseenv())))
+ )
+
+ expect_equal(
+ funs(mean(.)),
+ funs_(list(~mean(.)))
+ )
+})
+
+test_that("group_by_ works", {
+ expect_equal(
+ group_by_(df, ~a),
+ group_by(df, a)
+ )
+
+ expect_equal(
+ group_by_(df, ~-a),
+ group_by(df, -a)
+ )
+
+ expect_equal(
+ group_by_(df, .dots = "a"),
+ group_by(df, a)
+ )
+
+ expect_equal(
+ group_by_(df, .dots = list(quote(-a))),
+ group_by(df, -a)
+ )
+
+ expect_equal(
+ group_by_(df, .dots = list(~-a)),
+ group_by(df, -a)
+ )
+
+ expect_warning(
+ expect_equal(
+ group_by_(df %>% rowwise, ~a),
+ group_by(df %>% rowwise, a)
+ ),
+ "rowwise"
+ )
+
+ expect_warning(
+ expect_equal(
+ group_by_(df %>% rowwise, ~-a),
+ group_by(df %>% rowwise, -a)
+ ),
+ "rowwise"
+ )
+
+ expect_warning(
+ expect_equal(
+ group_by_(df %>% rowwise, .dots = "a"),
+ group_by(df %>% rowwise, a)
+ ),
+ "rowwise"
+ )
+
+ expect_warning(
+ expect_equal(
+ group_by_(df %>% rowwise, .dots = list(quote(-a))),
+ group_by(df %>% rowwise, -a)
+ ),
+ "rowwise"
+ )
+
+ expect_warning(
+ expect_equal(
+ group_by_(df %>% rowwise, .dots = list(~-a)),
+ group_by(df %>% rowwise, -a)
+ ),
+ "rowwise"
+ )
+})
+
+test_that("mutate_ works", {
+ expect_equal(
+ mutate_(df, c = ~-a),
+ mutate(df, c = -a)
+ )
+
+ expect_equal(
+ mutate_(df, .dots = list(c = quote(-a))),
+ mutate(df, c = -a)
+ )
+
+ expect_equal(
+ mutate_(df, .dots = list(c = ~-a)),
+ mutate(df, c = -a)
+ )
+
+ expect_identical(
+ mutate_(df, ~-a),
+ mutate(df, -a)
+ )
+
+ foo <- "foobar"
+ expect_identical(
+ mutate_(df, .dots = "foo"),
+ mutate(df, foo)
+ )
+})
+
+test_that("rename_ works", {
+ expect_equal(
+ rename_(df, c = ~a),
+ rename(df, c = a)
+ )
+
+ expect_equal(
+ rename_(df, .dots = list(c = quote(a))),
+ rename(df, c = a)
+ )
+
+ expect_equal(
+ rename_(df, .dots = list(c = ~a)),
+ rename(df, c = a)
+ )
+})
+
+test_that("select_ works", {
+ expect_equal(
+ select_(df, ~a),
+ select(df, a)
+ )
+
+ expect_equal(
+ select_(df, ~-a),
+ select(df, -a)
+ )
+
+ expect_equal(
+ select_(df, .dots = "a"),
+ select(df, a)
+ )
+
+ expect_equal(
+ select_(df, .dots = list(quote(-a))),
+ select(df, -a)
+ )
+
+ expect_equal(
+ select_(df, .dots = list(~-a)),
+ select(df, -a)
+ )
+
+ pos <- 1
+ expect_identical(
+ select_(df, c = "pos"),
+ select(df, c = pos)
+ )
+})
+
+test_that("slice_ works", {
+ expect_equal(
+ slice_(df, ~2:n()),
+ slice(df, 2:n())
+ )
+
+ expect_equal(
+ slice_(df, .dots = list(quote(2:n()))),
+ slice(df, 2:n())
+ )
+
+ expect_equal(
+ slice_(df, .dots = list(~2:n())),
+ slice(df, 2:n())
+ )
+
+ pos <- 3
+ expect_identical(
+ slice_(df, .dots = "pos:n()"),
+ slice(df, pos:n())
+ )
+})
+
+test_that("summarise_ works", {
+ expect_equal(
+ summarise_(df, ~mean(a)),
+ summarise(df, mean(a))
+ )
+
+ expect_equal(
+ summarise_(df, .dots = list(quote(mean(a)))),
+ summarise(df, mean(a))
+ )
+
+ expect_equal(
+ summarise_(df, .dots = list(~mean(a))),
+ summarise(df, mean(a))
+ )
+
+ my_mean <- mean
+ expect_identical(
+ summarise_(df, .dots = "my_mean(a)"),
+ summarise(df, my_mean(a))
+ )
+
+ expect_equal(
+ summarise_(df %>% group_by(b), ~mean(a)),
+ summarise(df %>% group_by(b), mean(a))
+ )
+
+ expect_equal(
+ summarise_(df %>% group_by(b), .dots = list(quote(mean(a)))),
+ summarise(df %>% group_by(b), mean(a))
+ )
+
+ expect_equal(
+ summarise_(df %>% group_by(b), .dots = list(~mean(a))),
+ summarise(df %>% group_by(b), mean(a))
+ )
+})
+
+test_that("summarize_ works", {
+ expect_equal(
+ summarize_(df, ~mean(a)),
+ summarize(df, mean(a))
+ )
+
+ expect_equal(
+ summarize_(df, .dots = list(quote(mean(a)))),
+ summarize(df, mean(a))
+ )
+
+ expect_equal(
+ summarize_(df, .dots = list(~mean(a))),
+ summarize(df, mean(a))
+ )
+
+ expect_equal(
+ summarize_(df %>% group_by(b), ~mean(a)),
+ summarize(df %>% group_by(b), mean(a))
+ )
+
+ expect_equal(
+ summarize_(df %>% group_by(b), .dots = list(quote(mean(a)))),
+ summarize(df %>% group_by(b), mean(a))
+ )
+
+ expect_equal(
+ summarize_(df %>% group_by(b), .dots = list(~mean(a))),
+ summarize(df %>% group_by(b), mean(a))
+ )
+})
+
+test_that("transmute_ works", {
+ expect_equal(
+ transmute_(df, c = ~-a),
+ transmute(df, c = -a)
+ )
+
+ expect_equal(
+ transmute_(df, .dots = list(c = quote(-a))),
+ transmute(df, c = -a)
+ )
+
+ expect_equal(
+ transmute_(df, .dots = list(c = ~-a)),
+ transmute(df, c = -a)
+ )
+
+ foo <- "foobar"
+ expect_identical(
+ transmute_(df, .dots = "foo"),
+ transmute(df, foo)
+ )
+})
diff --git a/tests/testthat/test-union-all.R b/tests/testthat/test-union-all.R
index e695cdd..41a04de 100644
--- a/tests/testthat/test-union-all.R
+++ b/tests/testthat/test-union-all.R
@@ -10,14 +10,3 @@ test_that("union all on data frames calls bind rows", {
expect_equal(union_all(df1, df2), bind_rows(df1, df2))
})
-
-test_that("union on database uses UNION ALL", {
- skip_if_no_sqlite()
- db <- src_sqlite(":memory:", TRUE)
-
- df1 <- copy_to(db, data_frame(x = 1:2), "df1")
- df2 <- copy_to(db, data_frame(x = 1:2), "df2")
-
- res <- collect(union_all(df1, df2))
- expect_equal(res$x, rep(1:2, 2))
-})
diff --git a/tests/testthat/test-utils.R b/tests/testthat/test-utils.R
new file mode 100644
index 0000000..282a940
--- /dev/null
+++ b/tests/testthat/test-utils.R
@@ -0,0 +1,13 @@
+context("utils")
+
+test_that("check_pkg() gives correct error message", {
+ expect_error(
+ dplyr:::check_pkg("`__foobarbaz__`", "foobar a baz"),
+ "The `__foobarbaz__` package is required to foobar a baz")
+})
+
+test_that("get_vars() handles list of symbols as vars attribute", {
+ gdf <- group_by(tibble(g = 1:2), g)
+ gdf <- set_attrs(gdf, vars = list(sym("g")))
+ expect_identical(test_grouped_df(gdf), gdf)
+})
diff --git a/tests/testthat/test-window.R b/tests/testthat/test-window.R
index d5248e4..2d0ad95 100644
--- a/tests/testthat/test-window.R
+++ b/tests/testthat/test-window.R
@@ -14,65 +14,51 @@ test_that("If n = length(x), returns all missing", {
})
test_that("cumany handles NA (#408)", {
- batman <- c(NA,NA,NA,NA,NA)
+ batman <- c(NA, NA, NA, NA, NA)
expect_true(all(is.na(cumany(batman))))
expect_true(all(is.na(cumall(batman))))
- x <- c(FALSE,NA)
- expect_true( all( !cumall(x) ) )
+ x <- c(FALSE, NA)
+ expect_true(all(!cumall(x)))
- x <- c(TRUE,NA)
- expect_true( all( cumany(x) ) )
+ x <- c(TRUE, NA)
+ expect_true(all(cumany(x)))
})
test_that("percent_rank ignores NAs (#1132)", {
- expect_equal( percent_rank(c(1:3, NA)), c(0, 0.5, 1, NA) )
+ expect_equal(percent_rank(c(1:3, NA)), c(0, 0.5, 1, NA))
})
test_that("cume_dist ignores NAs (#1132)", {
- expect_equal( cume_dist(c(1:3, NA)), c(1/3, 2/3, 1, NA) )
+ expect_equal(cume_dist(c(1:3, NA)), c(1 / 3, 2 / 3, 1, NA))
})
-test_that( "cummean is not confused by FP error (#1387)", {
+test_that("cummean is not confused by FP error (#1387)", {
a <- rep(99, 9)
- expect_true( all( cummean(a) == a) )
+ expect_true(all(cummean(a) == a))
})
-# Databases ---------------------------------------------------------------
+test_that("order_by() returns correct value", {
+ expected <- int(15, 14, 12, 9, 5)
+ expect_identical(order_by(5:1, cumsum(1:5)), expected)
-test_that("over() only requires first argument", {
- expect_equal(over("X"), sql("'X' OVER ()"))
+ x <- 5:1; y <- 1:5
+ expect_identical(order_by(x, cumsum(y)), expected)
})
-test_that("multiple group by or order values don't have parens", {
+test_that("order_by() works in arbitrary envs (#2297)", {
+ env <- child_env("base")
expect_equal(
- over(ident("x"), order = c("x", "y")),
- sql('"x" OVER (ORDER BY "x", "y")')
+ with_env(env, dplyr::order_by(5:1, cumsum(1:5))),
+ rev(cumsum(rev(1:5)))
)
expect_equal(
- over(ident("x"), partition = c("x", "y")),
- sql('"x" OVER (PARTITION BY "x", "y")')
+ order_by(5:1, cumsum(1:5)),
+ rev(cumsum(rev(1:5)))
)
})
-test_that("connection affects quoting window function fields", {
- dbiTest <- structure(list(), class = "DBITestConnection")
- dbTest <- src_sql("test", con = dbiTest)
- testTable <- tbl_sql("test", src = dbTest, from = "table1")
-
- out <- filter(group_by(testTable, field1), min_rank(desc(field1)) < 2)
- sqlText <- sql_render(out)
-
- testthat::expect_equal(
- grep(paste(
- "^SELECT `field1`",
- "FROM \\(SELECT `field1`, rank\\(\\) OVER \\(PARTITION BY `field1` ORDER BY `field1` DESC\\) AS `[a-zA-Z0-9]+`",
- "FROM `table1`\\) `[a-zA-Z0-9]+`",
- "WHERE \\(`[a-zA-Z0-9]+` < 2.0\\)$",
- sep = "\n"
- ), sqlText),
- 1,
- info = sqlText
- )
+test_that("order_by() fails when not supplied a call (#3065)", {
+ expect_error(order_by(NULL, !! 1L), "`call` must be a function call, not an integer vector")
})
diff --git a/tests/testthat/utf-8.R b/tests/testthat/utf-8.R
index c378e10..d2bf8ca 100644
--- a/tests/testthat/utf-8.R
+++ b/tests/testthat/utf-8.R
@@ -1,17 +1,17 @@
# UTF-8 tests that can't be run on Windows CRAN
-df <- data.frame(中文1 = 1:10, 中文2 = 1:10, eng = 1:10)
+df <- data.frame(中文1 = 1:10, 中文2 = 1:10, eng = 1:10)
df2 <- df %>% mutate(中文1 = 中文1 + 1)
gdf2 <- df %>% group_by(eng) %>% mutate(中文1 = 中文1 + 1)
-expect_equal( strings_addresses(names(df)) , strings_addresses(names(df2)) )
-expect_equal( strings_addresses(names(df)) , strings_addresses(names(gdf2)) )
+expect_equal(strings_addresses(names(df)), strings_addresses(names(df2)))
+expect_equal(strings_addresses(names(df)), strings_addresses(names(gdf2)))
df3 <- filter(df2, eng > 5)
gdf3 <- filter(gdf2, eng > 5)
-expect_equal( strings_addresses(names(df)) , strings_addresses(names(df3)) )
-expect_equal( strings_addresses(names(df)) , strings_addresses(names(gdf3)) )
+expect_equal(strings_addresses(names(df)), strings_addresses(names(df3)))
+expect_equal(strings_addresses(names(df)), strings_addresses(names(gdf3)))
df4 <- filter(df2, 中文1 > 5)
gdf4 <- filter(gdf2, 中文1 > 5)
-expect_equal( strings_addresses(names(df)) , strings_addresses(names(df4)) )
-expect_equal( strings_addresses(names(df)) , strings_addresses(names(gdf4)) )
+expect_equal(strings_addresses(names(df)), strings_addresses(names(df4)))
+expect_equal(strings_addresses(names(df)), strings_addresses(names(gdf4)))
diff --git a/vignettes/compatibility.Rmd b/vignettes/compatibility.Rmd
new file mode 100644
index 0000000..76e8662
--- /dev/null
+++ b/vignettes/compatibility.Rmd
@@ -0,0 +1,288 @@
+---
+title: "dplyr compatibility"
+output: rmarkdown::html_vignette
+vignette: >
+ %\VignetteIndexEntry{dplyr compatibility}
+ %\VignetteEngine{knitr::rmarkdown}
+ %\VignetteEncoding{UTF-8}
+---
+
+```{r setup, include = FALSE}
+library(dplyr)
+knitr::opts_chunk$set(collapse = T, comment = "#>")
+```
+
+This vignette is aimed at package authors who need to update their code because of a backward incompatible change to dplyr. We do try and minimise backward incompatible changes as much as possible, but sometimes they are necessary in order to radically simplify existing code, or unlock a lot of potential value in the future.
+
+This vignette starts with some general advice on writing package code that works with multiple version of dplyr, then continues to discuss specific changes in dplyr versions.
+
+## Working with multiple dplyr versions
+
+Ideally, you want to make sure that your package works with both the released version and the development version of dplyr. This is typically a little bit more work, but has two big advantages:
+
+1. It's more convenient for your users, since they're not forced to update
+ dplyr if they don't want to)
+
+1. It's easier on CRAN since it doesn't require a massive coordinated release
+ of multiple packages.
+
+To make code work with multiple versions of a package, your first tool is the simple if statement:
+
+```{r, results = "hide"}
+if (utils::packageVersion("dplyr") > "0.5.0") {
+ # code for new version
+} else {
+ # code for old version
+}
+```
+
+Always condition on `> current-version`, not `>= next-version` because this will ensure that this branch is also used for the development version of the package. For example, if the current release is version "0.5.0", the development version will be "0.5.0.9000".
+
+Occasionally, you'll run into a situation where the `NAMESPACE` has changed and you need to conditionally import different functions. This typically occurs when functions are moved from one package to another. We try out best to provide automatic fallbacks, but this is not always possible. Often you can work around the problem by avoiding `importFrom` and using `::` instead. Do this where possible:
+
+```{r, eval = FALSE}
+if (utils::packageVersion("dplyr") > "0.5.0") {
+ dbplyr::build_sql(...)
+} else {
+ dplyr::build_sql(...)
+}
+```
+
+This will generate an `R CMD check` NOTE (because the one of the functions will always be missing), but this is ok. Simply explain that you get the note because you have written a wrapper to make sure your code is backward compatible.
+
+Sometimes it's not possible to avoid `importFrom()`. For example you might be importing a generic so that you can define a method for it. In this case, you can take advantage of a little-known feature in the `NAMESPACE` file: you can include `if` statements.
+
+```{r}
+#' @rawNamespace
+#' if (utils::packageVersion("dplyr") > "0.5.0") {
+#' importFrom("dbplyr", "build_sql")
+#' } else {
+#' importFrom("dplyr", "build_sql")
+#' }
+```
+
+## dplyr 0.6.0
+
+### Database code moves to dbplyr
+
+Almost all database related code has been moved out of dplyr and into a new package, [dbplyr](http://github.com/hadley/dbplyr/). This makes dplyr simpler, and will make it easier to release fixes for bugs that only affect databases. If you've implemented a database backend for dplyr, please read the [backend news](https://github.com/hadley/dbplyr/blob/master/NEWS.md#backends) on the backend.
+
+Depending on what generics you use, and what generics you provide methods for you, you may need to write some conditional code. To help make this easier we've written `wrap_dbplyr_obj()` which will write the helper code for you:
+
+```{r, eval = FALSE}
+wrap_dbplyr_obj("build_sql")
+
+wrap_dbplyr_obj("base_agg")
+```
+
+Simply copy the results of this function in your package.
+
+These will generate `R CMD check` NOTES, so make sure to tell CRAN that this is to ensure backward compatibility.
+
+
+### Deprecation of underscored `verbs_()`
+
+Because the tidyeval framework allows us to combine SE and NSE
+semantics within the same functions, the underscored verbs have been
+softly deprecated.
+
+
+#### For users of SE_ verbs
+
+The legacy underscored versions take objects for which a
+`lazyeval::as.lazy()` method is defined. This includes symbols and
+calls, strings, and formulas. All of these objects have been replaced
+with quosures and you can call tidyeval verbs with unquoted quosures:
+
+```{r, eval = FALSE}
+quo <- quo(cyl)
+select(mtcars, !! quo)
+```
+
+Symbolic expressions are also supported, but note that bare symbols
+and calls do not carry scope information. If you're referring to
+objects in the data frame, it's safe to omit specifying an enclosure:
+
+```{r, results = "hide"}
+sym <- quote(cyl)
+select(mtcars, !! sym)
+
+call <- quote(mean(cyl))
+summarise(mtcars, !! call)
+```
+
+Transforming objects into quosures is generally straightforward. To
+enclose with the current environment, you can unquote directly in
+`quo()` or you can use `as_quosure()`:
+
+```{r}
+quo(!! sym)
+quo(!! call)
+
+rlang::as_quosure(sym)
+rlang::as_quosure(call)
+```
+
+Note that while formulas and quosures are very similar objects (and in
+the most general sense, formulas are quosures), they can't be used
+interchangeably in tidyeval functions. Early implementations did treat
+bare formulas as quosures, but this created compatibility issues with
+modelling functions of the stats package. Fortunately, it's easy to
+transform formulas to quosures that will self-evaluate in tidyeval
+functions:
+
+```{r}
+f <- ~cyl
+f
+rlang::as_quosure(f)
+```
+
+Finally, and perhaps most importantly, **strings are not and should
+not be parsed**. As developers, it is tempting to try and solve
+problems using strings because we have been trained to work with
+strings rather than quoted expressions. However it's almost always the
+wrong way to approach the problem. The exception is for creating
+symbols. In that case it is perfectly legitimate to use strings:
+
+```{r}
+rlang::sym("cyl")
+rlang::syms(letters[1:3])
+```
+
+But you should never use strings to create calls. Instead you can use
+quasiquotation:
+
+```{r}
+syms <- rlang::syms(c("foo", "bar", "baz"))
+quo(my_call(!!! syms))
+
+fun <- rlang::sym("my_call")
+quo(UQ(fun)(!!! syms))
+```
+
+Or create the call with `lang()`:
+
+```{r}
+call <- rlang::lang("my_call", !!! syms)
+call
+
+rlang::as_quosure(call)
+
+# Or equivalently:
+quo(!! rlang::lang("my_call", !!! syms))
+```
+
+Note that idioms based on `interp()` should now generally be avoided
+and replaced with quasiquotation. Where you used to interpolate:
+
+```{r, eval=FALSE}
+lazyeval::interp(~ mean(var), var = rlang::sym("mpg"))
+```
+
+You would now unquote:
+
+```{r, eval=FALSE}
+var <- "mpg"
+quo(mean(!! rlang::sym(var)))
+```
+
+See also `vignette("programming")` for more about quasiquotation and
+quosures.
+
+
+#### For package authors
+
+For package authors, rlang provides a
+[compatibility file](https://github.com/hadley/rlang/blob/master/R/compat-lazyeval.R) that
+you can copy to your package. `compat_lazy()` and `compat_lazy_dots()`
+turn lazy-able objects into proper quosures. This helps providing an
+underscored version to your users for backward compatibility. For
+instance, here is how we defined the underscored version of `filter()`
+in dplyr 0.6:
+
+```{r, eval = FALSE}
+filter_.tbl_df <- function(.data, ..., .dots = list()) {
+ dots <- compat_lazy_dots(.dots, caller_env(), ...)
+ filter(.data, !!! dots)
+}
+```
+
+With tidyeval, S3 dispatch to the correct method might be an issue. In
+the past, the genericity of dplyr verbs was accomplished by
+dispatching in the underscored versions. Now that those are
+deprecated, we've turned the non-underscored verbs into S3 generics.
+
+We maintain backward compatibility by redispatching to old underscored
+verbs in the default methods of the new S3 generics. For example, here
+is how we redispatch `filter()`:
+
+```{r, eval = FALSE}
+filter.default <- function(.data, ...) {
+ filter_(.data, .dots = compat_as_lazy_dots(...))
+}
+```
+
+This gets the job done in most cases. However, the default method will
+not be called for objects inheriting from one of the classes for which
+we provide non-underscored methods: `data.frame`, `tbl_df`, `tbl_cube`
+and `grouped_df`. An example of this is the `sf` package whose objects
+have classes `c("sf", "data.frame")`. Authors of such packages should
+provide a method for the non-underscored generic in order to be
+compatible with dplyr:
+
+```{r, eval = FALSE}
+filter.sf <- function(.data, ...) {
+ st_as_sf(NextMethod())
+}
+```
+
+If you need help with this, please let us know!
+
+
+### Deprecation of `mutate_each()` and `summarise_each()`
+
+These functions have been replaced by a more complete family of
+functions. This family has suffixes `_if`, `_at` and `_all` and
+includes more verbs than just `mutate` `summarise`.
+
+If you need to update your code to the new family, there are two
+relevant functions depending on which variables you apply `funs()` to.
+If you called `mutate_each()` without supplying a selection of
+variables, `funs` is applied to all variables. In this case, you
+should update your code to use `mutate_all()` instead:
+
+```{r, eval = FALSE}
+mutate_each(starwars, funs(as.character))
+mutate_all(starwars, funs(as.character))
+```
+
+Note that the new verbs support bare functions as well, so you don't
+necessarily need to wrap with `funs()`:
+
+```{r, eval = FALSE}
+mutate_all(starwars, as.character)
+```
+
+
+On the other hand, if you supplied a variable selection, you should
+use `mutate_at()`. The variable selection should be wrapped with
+`vars()`.
+
+```{r, eval = FALSE}
+mutate_each(starwars, funs(as.character), height, mass)
+mutate_at(starwars, vars(height, mass), as.character)
+```
+
+`vars()` supports all the selection helpers that you usually use with
+`select()`:
+
+```{r, eval = FALSE}
+summarise_at(mtcars, vars(starts_with("d")), mean)
+```
+
+Note that intead of a `vars()` selection, you can also supply
+character vectors of column names:
+
+```{r, eval = FALSE}
+mutate_at(starwars, c("height", "mass"), as.character)
+```
diff --git a/vignettes/data_frames.Rmd b/vignettes/data_frames.Rmd
deleted file mode 100644
index a5303c5..0000000
--- a/vignettes/data_frames.Rmd
+++ /dev/null
@@ -1,79 +0,0 @@
----
-title: "Data frame performance"
-date: "`r Sys.Date()`"
-output: rmarkdown::html_vignette
-vignette: >
- %\VignetteIndexEntry{Data frame performance}
- %\VignetteEngine{knitr::rmarkdown}
- \usepackage[utf8]{inputenc}
----
-
-```{r, echo = FALSE, message = FALSE}
-knitr::opts_chunk$set(collapse = T, comment = "#>")
-options(tibble.print_min = 4L, tibble.print_max = 4L)
-library(dplyr)
-```
-
-One of the reasons that dplyr is fast is that it's very careful about when to make copies. This section describes how this works, and gives you some useful tools for understanding the memory usage of data frames in R.
-
-The first tool we'll use is `dplyr::location()`. It tells us the memory location of three components of a data frame object:
-
-* the data frame itself
-* each column
-* each attribute
-
-```{r}
-location(iris)
-```
-
-It's useful to know the memory address, because if the address changes, then you'll know that R has made a copy. Copies are bad because they take time to create. This isn't usually a bottleneck if you have a few thousand values, but if you have millions or tens of millions of values it starts to take significant amounts of time. Unnecessary copies are also bad because they take up memory.
-
-R tries to avoid making copies where possible. For example, if you just assign `iris` to another variable, it continues to the point same location:
-
-```{r}
-iris2 <- iris
-location(iris2)
-```
-
-Rather than having to compare hard to read memory locations, we can instead use the `dplyr::changes()` function to highlights changes between two versions of a data frame. The code below shows us that `iris` and `iris2` are identical: both names point to the same location in memory.
-
-```{r}
-changes(iris2, iris)
-```
-
-What do you think happens if you modify a single column of `iris2`? In R 3.1.0 and above, R knows to modify only that one column and to leave the others pointing to their existing locations:
-
-```{r}
-iris2$Sepal.Length <- iris2$Sepal.Length * 2
-changes(iris, iris2)
-```
-
-(This was not the case prior to version 3.1.0, where R created a deep copy of the entire data frame.)
-
-dplyr is equally smart:
-
-```{r}
-iris3 <- mutate(iris, Sepal.Length = Sepal.Length * 2)
-changes(iris3, iris)
-```
-
-It creates only one new column while all the other columns continue to point at their original locations. You might notice that the attributes are still copied. However, this has little impact on performance. Because attributes are usually short vectors, the internal dplyr code needed to copy them is also considerably simpler.
-
-dplyr never makes copies unless it has to:
-
-* `tbl_df()` and `group_by()` don't copy columns
-
-* `select()` never copies columns, even when you rename them
-
-* `mutate()` never copies columns, except when you modify an existing column
-
-* `arrange()` must always copy all columns because you're changing the order of every one.
- This is an expensive operation for big data, but you can generally avoid
- it using the order argument to [window functions](window-functions.html)
-
-* `summarise()` creates new data, but it's usually at least an order of
- magnitude smaller than the original data.
-
-In short, dplyr lets you work with data frames with very little memory overhead.
-
-data.table takes this idea one step further: it provides functions that modify a data table in place. This avoids the need to make copies of pointers to existing columns and attributes, and speeds up operations when you have many columns. dplyr doesn't do this with data frames (although it could) because I think it's safer to keep data immutable: even if the resulting data frame shares practically all the data of the original data frame, all dplyr data frame methods return a new data frame.
diff --git a/inst/doc/data_frames.html b/vignettes/data_frames.html
similarity index 85%
rename from inst/doc/data_frames.html
rename to vignettes/data_frames.html
index c36431d..33f2f83 100644
--- a/inst/doc/data_frames.html
+++ b/vignettes/data_frames.html
@@ -11,7 +11,7 @@
<meta name="viewport" content="width=device-width, initial-scale=1">
-<meta name="date" content="2016-06-23" />
+<meta name="date" content="2016-10-27" />
<title>Data frame performance</title>
@@ -68,7 +68,7 @@ code > span.in { color: #60a0b0; font-weight: bold; font-style: italic; } /* Inf
<h1 class="title toc-ignore">Data frame performance</h1>
-<h4 class="date"><em>2016-06-23</em></h4>
+<h4 class="date"><em>2016-10-27</em></h4>
@@ -80,32 +80,32 @@ code > span.in { color: #60a0b0; font-weight: bold; font-style: italic; } /* Inf
<li>each attribute</li>
</ul>
<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"><span class="kw">location</span>(iris)
-<span class="co">#> <0x7fdc68e309a8></span>
+<span class="co">#> <0xd3c1448></span>
<span class="co">#> Variables:</span>
-<span class="co">#> * Sepal.Length: <0x7fdc68f06200></span>
-<span class="co">#> * Sepal.Width: <0x7fdc68f25000></span>
-<span class="co">#> * Petal.Length: <0x7fdc68f25600></span>
-<span class="co">#> * Petal.Width: <0x7fdc68f25c00></span>
-<span class="co">#> * Species: <0x7fdc6843e9e0></span>
+<span class="co">#> * Sepal.Length: <0x9bafe90></span>
+<span class="co">#> * Sepal.Width: <0xcaff400></span>
+<span class="co">#> * Petal.Length: <0xa6479a0></span>
+<span class="co">#> * Petal.Width: <0xa77fdf0></span>
+<span class="co">#> * Species: <0xc56c240></span>
<span class="co">#> Attributes:</span>
-<span class="co">#> * names: <0x7fdc68e30940></span>
-<span class="co">#> * row.names: <0x7fdc6843fa00></span>
-<span class="co">#> * class: <0x7fdc688bbb48></span></code></pre></div>
+<span class="co">#> * names: <0xd3c1580></span>
+<span class="co">#> * row.names: <0xb82d160></span>
+<span class="co">#> * class: <0xd166cb8></span></code></pre></div>
<p>It’s useful to know the memory address, because if the address changes, then you’ll know that R has made a copy. Copies are bad because they take time to create. This isn’t usually a bottleneck if you have a few thousand values, but if you have millions or tens of millions of values it starts to take significant amounts of time. Unnecessary copies are also bad because they take up memory.</p>
<p>R tries to avoid making copies where possible. For example, if you just assign <code>iris</code> to another variable, it continues to the point same location:</p>
<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r">iris2 <-<span class="st"> </span>iris
<span class="kw">location</span>(iris2)
-<span class="co">#> <0x7fdc68e309a8></span>
+<span class="co">#> <0xd3c1448></span>
<span class="co">#> Variables:</span>
-<span class="co">#> * Sepal.Length: <0x7fdc68f06200></span>
-<span class="co">#> * Sepal.Width: <0x7fdc68f25000></span>
-<span class="co">#> * Petal.Length: <0x7fdc68f25600></span>
-<span class="co">#> * Petal.Width: <0x7fdc68f25c00></span>
-<span class="co">#> * Species: <0x7fdc6843e9e0></span>
+<span class="co">#> * Sepal.Length: <0x9bafe90></span>
+<span class="co">#> * Sepal.Width: <0xcaff400></span>
+<span class="co">#> * Petal.Length: <0xa6479a0></span>
+<span class="co">#> * Petal.Width: <0xa77fdf0></span>
+<span class="co">#> * Species: <0xc56c240></span>
<span class="co">#> Attributes:</span>
-<span class="co">#> * names: <0x7fdc68e30940></span>
-<span class="co">#> * row.names: <0x7fdc6843c9e0></span>
-<span class="co">#> * class: <0x7fdc688bbb48></span></code></pre></div>
+<span class="co">#> * names: <0xd3c1580></span>
+<span class="co">#> * row.names: <0xbac73a0></span>
+<span class="co">#> * class: <0xd166cb8></span></code></pre></div>
<p>Rather than having to compare hard to read memory locations, we can instead use the <code>dplyr::changes()</code> function to highlights changes between two versions of a data frame. The code below shows us that <code>iris</code> and <code>iris2</code> are identical: both names point to the same location in memory.</p>
<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"><span class="kw">changes</span>(iris2, iris)
<span class="co">#> <identical></span></code></pre></div>
@@ -113,25 +113,25 @@ code > span.in { color: #60a0b0; font-weight: bold; font-style: italic; } /* Inf
<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r">iris2$Sepal.Length <-<span class="st"> </span>iris2$Sepal.Length *<span class="st"> </span><span class="dv">2</span>
<span class="kw">changes</span>(iris, iris2)
<span class="co">#> Changed variables:</span>
-<span class="co">#> old new </span>
-<span class="co">#> Sepal.Length 0x7fdc68f06200 0x7fdc68fda800</span>
+<span class="co">#> old new </span>
+<span class="co">#> Sepal.Length 0x9bafe90 0xcbfb240</span>
<span class="co">#> </span>
<span class="co">#> Changed attributes:</span>
-<span class="co">#> old new </span>
-<span class="co">#> row.names 0x7fdc6ba02760 0x7fdc6ba029e0</span></code></pre></div>
+<span class="co">#> old new </span>
+<span class="co">#> row.names 0xb889ed0 0xb889350</span></code></pre></div>
<p>(This was not the case prior to version 3.1.0, where R created a deep copy of the entire data frame.)</p>
<p>dplyr is equally smart:</p>
<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r">iris3 <-<span class="st"> </span><span class="kw">mutate</span>(iris, <span class="dt">Sepal.Length =</span> Sepal.Length *<span class="st"> </span><span class="dv">2</span>)
<span class="kw">changes</span>(iris3, iris)
<span class="co">#> Changed variables:</span>
-<span class="co">#> old new </span>
-<span class="co">#> Sepal.Length 0x7fdc6952e000 0x7fdc68f06200</span>
+<span class="co">#> old new </span>
+<span class="co">#> Sepal.Length 0xe2bbe90 0x9bafe90</span>
<span class="co">#> </span>
<span class="co">#> Changed attributes:</span>
-<span class="co">#> old new </span>
-<span class="co">#> class 0x7fdc6ab8ef58 0x7fdc688bbb48</span>
-<span class="co">#> names 0x7fdc699bed48 0x7fdc68e30940</span>
-<span class="co">#> row.names 0x7fdc6840c2d0 0x7fdc68427560</span></code></pre></div>
+<span class="co">#> old new </span>
+<span class="co">#> class 0xdfbdf88 0xd166cb8</span>
+<span class="co">#> names 0xd13f858 0xd3c1580</span>
+<span class="co">#> row.names 0xc9ff590 0x8934200</span></code></pre></div>
<p>It creates only one new column while all the other columns continue to point at their original locations. You might notice that the attributes are still copied. However, this has little impact on performance. Because attributes are usually short vectors, the internal dplyr code needed to copy them is also considerably simpler.</p>
<p>dplyr never makes copies unless it has to:</p>
<ul>
diff --git a/vignettes/databases.Rmd b/vignettes/databases.Rmd
deleted file mode 100644
index 12cd4ba..0000000
--- a/vignettes/databases.Rmd
+++ /dev/null
@@ -1,257 +0,0 @@
----
-title: "Databases"
-date: "`r Sys.Date()`"
-output: rmarkdown::html_vignette
-vignette: >
- %\VignetteIndexEntry{Databases}
- %\VignetteEngine{knitr::rmarkdown}
- \usepackage[utf8]{inputenc}
----
-
-```{r, echo = FALSE, message = FALSE}
-knitr::opts_chunk$set(collapse = T, comment = "#>")
-options(tibble.print_min = 4L, tibble.print_max = 4L)
-library(dplyr)
-```
-
-As well as working with local in-memory data like data frames and data tables, dplyr also works with remote on-disk data stored in databases. Generally, if your data fits in memory there is no advantage to putting it in a database: it will only be slower and more hassle. The reason you'd want to use dplyr with a database is because either your data is already in a database (and you don't want to work with static csv files that someone else has dumped out for you), or you have so much dat [...]
-
-Since R almost exclusively works with in-memory data, if you do have a lot of data in a database, you can't just dump it into R. Instead, you'll have to work with subsets or aggregates. dplyr aims to make this task as easy as possible. If you're working with large data, it's also likely that you'll need support to get the data into the database and to ensure you have the right indices for good performance. While dplyr provides some simple tools to help with these tasks, they are no subst [...]
-
-The motivation for supporting databases in dplyr is that you never pull down the right subset or aggregate from the database on your first try. Usually you have to iterate between R and SQL many times before you get the perfect dataset. But because switching between languages is cognitively challenging (especially because R and SQL are so perilously similar), dplyr helps you by allowing you to write R code that is automatically translated to SQL. The goal of dplyr is not to replace every [...]
-
-To get the most out of this chapter, you'll need to be familiar with querying SQL databases using the `SELECT` statement. If you have some familiarity with SQL and you'd like to learn more, I found [how indexes work in SQLite](http://www.sqlite.org/queryplanner.html) and [10 easy steps to a complete understanding of SQL](http://blog.jooq.org/2016/03/17/10-easy-steps-to-a-complete-understanding-of-sql) to be particularly helpful.
-
-## Getting started
-
-The easiest way to experiment with databases using dplyr is to use SQLite. This is because everything you need is already included in the R package. You won't need to install anything, and you won't need to deal with the hassle of setting up a database server. Doing so is really easy: just give the path and the ok to create a table.
-
-```{r, eval = FALSE}
-my_db <- src_sqlite("my_db.sqlite3", create = T)
-```
-
-The main new concept here is the `src`, which is a collection of types of database tables. Use `src_sqlite()`, `src_mysql()`, `src_postgres()` and `src_bigquery()` to connect to the specific types supported by dplyr.
-
-`my_db` currently has no data in it, so we'll import the `flights` data using the convenient `copy_to()` function. This is a quick and dirty way of getting data into a database. Because all the data has to flow through R, you should note that this is not suitable for very large datasets.
-
-```{r, eval = FALSE}
-library(nycflights13)
-flights_sqlite <- copy_to(my_db, flights, temporary = FALSE, indexes = list(
- c("year", "month", "day"), "carrier", "tailnum"))
-```
-
-As you can see, the `copy_to()` operation has an additional argument that allows you to supply indexes for the table. Here we set up indexes that will allow us to quickly process the data by day, by carrier and by plane. `copy_to()` also executes the SQL `ANALYZE` command: this ensures that the database has up-to-date table statistics and performs the appropriate query optimisations.
-
-For this particular dataset, there's a built-in `src` that will cache `flights` in a standard location:
-
-```{r}
-flights_sqlite <- tbl(nycflights13_sqlite(), "flights")
-flights_sqlite
-```
-
-You can also use arbitrary SQL:
-
-```{r, eval = FALSE}
-tbl(my_db, sql("SELECT * FROM flights"))
-```
-
-## Basic verbs
-
-Whether you're dealing with remote or local data sources, you use the same five verbs:
-
-```{r}
-select(flights_sqlite, year:day, dep_delay, arr_delay)
-filter(flights_sqlite, dep_delay > 240)
-arrange(flights_sqlite, year, month, day)
-mutate(flights_sqlite, speed = air_time / distance)
-summarise(flights_sqlite, delay = mean(dep_time))
-```
-
-The most important difference is that the expressions in `select()`, `filter()`, `arrange()`, `mutate()`, and `summarise()` are translated into SQL so they can be run on the database. While the translations for the most common operations are almost perfect there are some limitations, which you'll learn about later.
-
-## Laziness
-
-When working with databases, dplyr tries to be as lazy as possible:
-
-* It never pulls data into R unless you explicitly ask for it.
-
-* It delays doing any work until the last possible moment: it collects together
- everything you want to do and then sends it to the database in one step.
-
-For example, take the following code:
-
-```{r}
-c1 <- filter(flights_sqlite, year == 2013, month == 1, day == 1)
-c2 <- select(c1, year, month, day, carrier, dep_delay, air_time, distance)
-c3 <- mutate(c2, speed = distance / air_time * 60)
-c4 <- arrange(c3, year, month, day, carrier)
-```
-
-Surprisingly, this sequence of operations never actually touches the database. It's not until you ask for the data (e.g. by printing `c4`) that dplyr generates the SQL and requests the results from the database. Even then it only pulls down 10 rows.
-
-```{r}
-c4
-```
-
-To pull down all the results use `collect()`, which returns a `tbl_df()`:
-
-```{r}
-collect(c4)
-```
-
-You can see the query dplyr has generated by looking at the `query` component of the object:
-
-```{r}
-c4$query
-```
-
-You can also ask the database how it plans to execute the query with `explain()`. The output for SQLite is described in more detail on the [SQLite website](http://www.sqlite.org/eqp.html). It's helpful if you're trying to figure out which indexes are being used.
-
-```{r}
-explain(c4)
-```
-
-### Forcing computation
-
-There are three ways to force the computation of a query:
-
-* `collect()` executes the query and returns the results to R.
-
-* `compute()` executes the query and stores the results in a temporary table
- in the database.
-
-* `collapse()` turns the query into a table expression.
-
-`collect()` is the function you'll use most. Once you reach the set of operations you want, you use collect() to pull the data into a local `tbl_df()`. If you know SQL, you can use `compute()` and `collapse()` to optimise performance.
-
-### Performance considerations
-
-dplyr tries to prevent you from accidentally performing expensive query operations:
-
-* Because there's generally no way to determine how many rows a query will return unless
- you actually run it, `nrow()` is always `NA`.
-
-* Printing a tbl only runs the query for the first 10 rows.
-
-* Because you can't find the last few rows without executing the whole query, you can't use `tail()`.
-
-## SQL translation
-
-When performing the simple mathematical operations used when filtering, mutating or summarising, translating R code to SQL (or indeed to any programming language) is relatively straightforward.
-
-To experiment with this, use `translate_sql()`. The following examples work through some of the basic differences between R and SQL.
-
-```{r}
-# In SQLite variable names are escaped by double quotes:
-translate_sql(x)
-# And strings are escaped by single quotes
-translate_sql("x")
-
-# Many functions have slightly different names
-translate_sql(x == 1 && (y < 2 || z > 3))
-translate_sql(x ^ 2 < 10)
-translate_sql(x %% 2 == 10)
-
-# R and SQL have different defaults for integers and reals.
-# In R, 1 is a real, and 1L is an integer
-# In SQL, 1 is an integer, and 1.0 is a real
-translate_sql(1)
-translate_sql(1L)
-```
-
-dplyr knows how to convert the following R functions to SQL:
-
-* basic math operators: `+`, `-`, `*`, `/`, `%%`, `^`
-* math functions: `abs`, `acos`, `acosh`, `asin`, `asinh`, `atan`, `atan2`,
- `atanh`, `ceiling`, `cos`, `cosh`, `cot`, `coth`, `exp`, `floor`,
- `log`, `log10`, `round`, `sign`, `sin`, `sinh`, `sqrt`, `tan`, `tanh`
-* logical comparisons: `<`, `<=`, `!=`, `>=`, `>`, `==`, `%in%`
-* boolean operations: `&`, `&&`, `|`, `||`, `!`, `xor`
-* basic aggregations: `mean`, `sum`, `min`, `max`, `sd`, `var`
-
-The basic techniques that underlie the implementation of `translate_sql()` are described in the [Advanced R book](http://adv-r.had.co.nz/dsl.html). `translate_sql()` is built on top of R's parsing engine and has been carefully designed to generate correct SQL. It also protects you against SQL injection attacks by correctly escaping the strings and variable names needed by the database that you're connecting to.
-
-Perfect translation is not possible because databases don't have all the functions that R does. The goal of dplyr is to provide a semantic rather than a literal translation: what you mean rather than what is done. In fact, even for functions that exist both in databases and R, you shouldn't expect results to be identical; database programmers have different priorities than R core programmers. For example, in R in order to get a higher level of numerical accuracy, `mean()` loops through t [...]
-
-```{r, eval = FALSE}
-translate_sql(mean(x, trim = T))
-# Error: Invalid number of args to SQL AVG. Expecting 1
-```
-
-Any function that dplyr doesn't know how to convert is left as is. This means that database functions that are not covered by dplyr can be used directly via `translate_sql()`. Here are a couple of examples that will work with [SQLite](http://www.sqlite.org/lang_corefunc.html):
-
-```{r}
-translate_sql(glob(x, y))
-translate_sql(x %like% "ab*")
-```
-
-## Grouping
-
-SQLite lacks the window functions that are needed for grouped mutation and filtering. This means that the only really useful operations for grouped SQLite tables are found in `summarise()`. The grouped summarise from the introduction translates well - the only difference is that databases always drop NULLs (their equivalent of missing values), so we don't supply `na.rm = TRUE`.
-
-```{r}
-by_tailnum <- group_by(flights_sqlite, tailnum)
-delay <- summarise(by_tailnum,
- count = n(),
- dist = mean(distance),
- delay = mean(arr_delay)
-)
-delay <- filter(delay, count > 20, dist < 2000)
-delay_local <- collect(delay)
-```
-
-Other databases do support window functions. You can learn about them in the corresponding vignette. It's sometimes possible to simulate grouped filtering and mutation using self joins, which join the original table with a summarised version, but that topic is beyond the scope of this introduction.
-
-## Other databases
-
-Aside from SQLite, the overall workflow is essentially the same regardless of the database you're connecting to. The following sections go in to more details about the peculiarities of each database engine. All of these databases follow a client-server model - a computer that connects to the database and the computer that is running the database (the two may be one and the same but usually isn't). Getting one of these databases up and running is beyond the scope of this article, but ther [...]
-
-### PostgreSQL
-
-`src_postgres()` has five arguments: `dbname`, `host`, `port`, `user` and `password`. If you are running a local postgresql database with the default settings you only need `dbname`. But in most cases, you'll need all five. dplyr uses the RPostgreSQL package to connect to postgres databases. This means that you can't currently connect to remote databases that require a SSL connection (e.g. Heroku).
-
-For example, the following code allows me to connect to a local PostgreSQL database that contains a copy of the `flights` data:
-
-```{r, eval = FALSE}
-flights_postgres <- tbl(src_postgres("nycflights13"), "flights")
-```
-
-PostgreSQL is a considerably more powerful database than SQLite. It has:
-
-* a much wider range of [built-in functions](http://www.postgresql.org/docs/9.3/static/functions.html)
-
-* support for [window functions](http://www.postgresql.org/docs/9.3/static/tutorial-window.html), which allow grouped subset and mutates to work.
-
-The following examples show how we can perform grouped filter and mutate operations with PostgreSQL. Because you can't filter on window functions directly, the SQL generated from the grouped filter is quite complex; so they instead have to go in a subquery.
-
-```{r, eval = FALSE}
-daily <- group_by(flights_postgres, year, month, day)
-
-# Find the most and least delayed flight each day
-bestworst <- daily %>%
- select(flight, arr_delay) %>%
- filter(arr_delay == min(arr_delay) || arr_delay == max(arr_delay))
-bestworst$query
-
-# Rank each flight within a daily
-ranked <- daily %>%
- select(arr_delay) %>%
- mutate(rank = rank(desc(arr_delay)))
-ranked$query
-```
-
-### MySQL and MariaDB
-
-You can connect to MySQL and MariaDB (a recent fork of MySQL) using `src_mysql()`, mediated by the [RMySQL](https://github.com/jeffreyhorner/RMySQL) package. Like PostgreSQL, you'll need to provide a `dbname`, `username`, `password`, `host`, and `port`.
-
-In terms of functionality, MySQL lies somewhere between SQLite and PostgreSQL. It provides a wider range of [built-in functions](http://dev.mysql.com/doc/refman/5.0/en/functions.html), but it does not support window functions (so you can't do grouped mutates and filters).
-
-### BigQuery
-
-BigQuery is a hosted database server provided by Google. To connect, you need to provide your `project`, `dataset` and optionally a project for `billing` (if billing for `project` isn't enabled). After you create the src, your web browser will open and ask you to authenticate. Your credentials are stored in a local cache, so you should only need to do this once.
-
-BigQuery supports only one SQL statement: [SELECT](https://developers.google.com/bigquery/query-reference). Fortunately this is all you need for data analysis. Within SELECT, BigQuery provides comprehensive coverage at a similar level to PostgreSQL.
-
-## Picking a database
-
-If you don't already have a database, here's some advice from my experiences setting up and running all of them. SQLite is by far the easiest to get started with, but the lack of window functions makes it limited for data analysis. PostgreSQL is not too much harder to use and has a wide range of built-in functions. Don't bother with MySQL/MariaDB: it's a pain to set up and the documentation is subpar. Google BigQuery might be a good fit if you have very large data, or if you're willing t [...]
diff --git a/inst/doc/databases.html b/vignettes/databases.html
similarity index 74%
rename from inst/doc/databases.html
rename to vignettes/databases.html
index 9e04464..c9c4508 100644
--- a/inst/doc/databases.html
+++ b/vignettes/databases.html
@@ -11,7 +11,7 @@
<meta name="viewport" content="width=device-width, initial-scale=1">
-<meta name="date" content="2016-06-23" />
+<meta name="date" content="2016-10-27" />
<title>Databases</title>
@@ -68,7 +68,7 @@ code > span.in { color: #60a0b0; font-weight: bold; font-style: italic; } /* Inf
<h1 class="title toc-ignore">Databases</h1>
-<h4 class="date"><em>2016-06-23</em></h4>
+<h4 class="date"><em>2016-10-27</em></h4>
@@ -88,20 +88,26 @@ flights_sqlite <-<span class="st"> </span><span class="kw">copy_to</span>(my_
<p>As you can see, the <code>copy_to()</code> operation has an additional argument that allows you to supply indexes for the table. Here we set up indexes that will allow us to quickly process the data by day, by carrier and by plane. <code>copy_to()</code> also executes the SQL <code>ANALYZE</code> command: this ensures that the database has up-to-date table statistics and performs the appropriate query optimisations.</p>
<p>For this particular dataset, there’s a built-in <code>src</code> that will cache <code>flights</code> in a standard location:</p>
<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r">flights_sqlite <-<span class="st"> </span><span class="kw">tbl</span>(<span class="kw">nycflights13_sqlite</span>(), <span class="st">"flights"</span>)
-<span class="co">#> Caching nycflights db at /private/tmp/RtmpduTf9z/Rinst1430371d1558/dplyr/db/nycflights13.sqlite</span>
+<span class="co">#> Caching nycflights db at /home/muelleki/R/x86_64-pc-linux-gnu-library/3.3/dplyr/db/nycflights13.sqlite</span>
+<span class="co">#> Creating table: airlines</span>
+<span class="co">#> Creating table: airports</span>
+<span class="co">#> Creating table: flights</span>
+<span class="co">#> Creating table: planes</span>
+<span class="co">#> Creating table: weather</span>
flights_sqlite
-<span class="co">#> Source: query [?? x 16]</span>
-<span class="co">#> Database: sqlite 3.8.6 [/private/tmp/RtmpduTf9z/Rinst1430371d1558/dplyr/db/nycflights13.sqlite]</span>
+<span class="co">#> Source: query [?? x 19]</span>
+<span class="co">#> Database: sqlite 3.11.1 [/home/muelleki/R/x86_64-pc-linux-gnu-library/3.3/dplyr/db/nycflights13.sqlite]</span>
<span class="co">#> </span>
-<span class="co">#> # S3: tbl_sqlite</span>
-<span class="co">#> year month day dep_time dep_delay arr_time arr_delay carrier tailnum</span>
-<span class="co">#> <int> <int> <int> <int> <dbl> <int> <dbl> <chr> <chr></span>
-<span class="co">#> 1 2013 1 1 517 2 830 11 UA N14228</span>
-<span class="co">#> 2 2013 1 1 533 4 850 20 UA N24211</span>
-<span class="co">#> 3 2013 1 1 542 2 923 33 AA N619AA</span>
-<span class="co">#> 4 2013 1 1 544 -1 1004 -18 B6 N804JB</span>
-<span class="co">#> ... with more rows, and 7 more variables: flight <int>, origin <chr>,</span>
-<span class="co">#> dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl></span></code></pre></div>
+<span class="co">#> year month day dep_time sched_dep_time dep_delay arr_time</span>
+<span class="co">#> <int> <int> <int> <int> <int> <dbl> <int></span>
+<span class="co">#> 1 2013 1 1 517 515 2 830</span>
+<span class="co">#> 2 2013 1 1 533 529 4 850</span>
+<span class="co">#> 3 2013 1 1 542 540 2 923</span>
+<span class="co">#> 4 2013 1 1 544 545 -1 1004</span>
+<span class="co">#> # ... with more rows, and 12 more variables: sched_arr_time <int>,</span>
+<span class="co">#> # arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,</span>
+<span class="co">#> # origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,</span>
+<span class="co">#> # minute <dbl>, time_hour <dbl></span></code></pre></div>
<p>You can also use arbitrary SQL:</p>
<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"><span class="kw">tbl</span>(my_db, <span class="kw">sql</span>(<span class="st">"SELECT * FROM flights"</span>))</code></pre></div>
</div>
@@ -110,61 +116,61 @@ flights_sqlite
<p>Whether you’re dealing with remote or local data sources, you use the same five verbs:</p>
<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"><span class="kw">select</span>(flights_sqlite, year:day, dep_delay, arr_delay)
<span class="co">#> Source: query [?? x 5]</span>
-<span class="co">#> Database: sqlite 3.8.6 [/private/tmp/RtmpduTf9z/Rinst1430371d1558/dplyr/db/nycflights13.sqlite]</span>
+<span class="co">#> Database: sqlite 3.11.1 [/home/muelleki/R/x86_64-pc-linux-gnu-library/3.3/dplyr/db/nycflights13.sqlite]</span>
<span class="co">#> </span>
-<span class="co">#> # S3: tbl_sqlite</span>
<span class="co">#> year month day dep_delay arr_delay</span>
<span class="co">#> <int> <int> <int> <dbl> <dbl></span>
<span class="co">#> 1 2013 1 1 2 11</span>
<span class="co">#> 2 2013 1 1 4 20</span>
<span class="co">#> 3 2013 1 1 2 33</span>
<span class="co">#> 4 2013 1 1 -1 -18</span>
-<span class="co">#> ... with more rows</span>
+<span class="co">#> # ... with more rows</span>
<span class="kw">filter</span>(flights_sqlite, dep_delay ><span class="st"> </span><span class="dv">240</span>)
-<span class="co">#> Source: query [?? x 16]</span>
-<span class="co">#> Database: sqlite 3.8.6 [/private/tmp/RtmpduTf9z/Rinst1430371d1558/dplyr/db/nycflights13.sqlite]</span>
+<span class="co">#> Source: query [?? x 19]</span>
+<span class="co">#> Database: sqlite 3.11.1 [/home/muelleki/R/x86_64-pc-linux-gnu-library/3.3/dplyr/db/nycflights13.sqlite]</span>
<span class="co">#> </span>
-<span class="co">#> # S3: tbl_sqlite</span>
-<span class="co">#> year month day dep_time dep_delay arr_time arr_delay carrier tailnum</span>
-<span class="co">#> <int> <int> <int> <int> <dbl> <int> <dbl> <chr> <chr></span>
-<span class="co">#> 1 2013 1 1 848 853 1001 851 MQ N942MQ</span>
-<span class="co">#> 2 2013 1 1 1815 290 2120 338 EV N17185</span>
-<span class="co">#> 3 2013 1 1 1842 260 1958 263 EV N18120</span>
-<span class="co">#> 4 2013 1 1 2115 255 2330 250 9E N924XJ</span>
-<span class="co">#> ... with more rows, and 7 more variables: flight <int>, origin <chr>,</span>
-<span class="co">#> dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl></span>
+<span class="co">#> year month day dep_time sched_dep_time dep_delay arr_time</span>
+<span class="co">#> <int> <int> <int> <int> <int> <dbl> <int></span>
+<span class="co">#> 1 2013 1 1 848 1835 853 1001</span>
+<span class="co">#> 2 2013 1 1 1815 1325 290 2120</span>
+<span class="co">#> 3 2013 1 1 1842 1422 260 1958</span>
+<span class="co">#> 4 2013 1 1 2115 1700 255 2330</span>
+<span class="co">#> # ... with more rows, and 12 more variables: sched_arr_time <int>,</span>
+<span class="co">#> # arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,</span>
+<span class="co">#> # origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,</span>
+<span class="co">#> # minute <dbl>, time_hour <dbl></span>
<span class="kw">arrange</span>(flights_sqlite, year, month, day)
-<span class="co">#> Source: query [?? x 16]</span>
-<span class="co">#> Database: sqlite 3.8.6 [/private/tmp/RtmpduTf9z/Rinst1430371d1558/dplyr/db/nycflights13.sqlite]</span>
+<span class="co">#> Source: query [?? x 19]</span>
+<span class="co">#> Database: sqlite 3.11.1 [/home/muelleki/R/x86_64-pc-linux-gnu-library/3.3/dplyr/db/nycflights13.sqlite]</span>
<span class="co">#> </span>
-<span class="co">#> # S3: tbl_sqlite</span>
-<span class="co">#> year month day dep_time dep_delay arr_time arr_delay carrier tailnum</span>
-<span class="co">#> <int> <int> <int> <int> <dbl> <int> <dbl> <chr> <chr></span>
-<span class="co">#> 1 2013 1 1 517 2 830 11 UA N14228</span>
-<span class="co">#> 2 2013 1 1 533 4 850 20 UA N24211</span>
-<span class="co">#> 3 2013 1 1 542 2 923 33 AA N619AA</span>
-<span class="co">#> 4 2013 1 1 544 -1 1004 -18 B6 N804JB</span>
-<span class="co">#> ... with more rows, and 7 more variables: flight <int>, origin <chr>,</span>
-<span class="co">#> dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl></span>
+<span class="co">#> year month day dep_time sched_dep_time dep_delay arr_time</span>
+<span class="co">#> <int> <int> <int> <int> <int> <dbl> <int></span>
+<span class="co">#> 1 2013 1 1 517 515 2 830</span>
+<span class="co">#> 2 2013 1 1 533 529 4 850</span>
+<span class="co">#> 3 2013 1 1 542 540 2 923</span>
+<span class="co">#> 4 2013 1 1 544 545 -1 1004</span>
+<span class="co">#> # ... with more rows, and 12 more variables: sched_arr_time <int>,</span>
+<span class="co">#> # arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,</span>
+<span class="co">#> # origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,</span>
+<span class="co">#> # minute <dbl>, time_hour <dbl></span>
<span class="kw">mutate</span>(flights_sqlite, <span class="dt">speed =</span> air_time /<span class="st"> </span>distance)
-<span class="co">#> Source: query [?? x 17]</span>
-<span class="co">#> Database: sqlite 3.8.6 [/private/tmp/RtmpduTf9z/Rinst1430371d1558/dplyr/db/nycflights13.sqlite]</span>
+<span class="co">#> Source: query [?? x 20]</span>
+<span class="co">#> Database: sqlite 3.11.1 [/home/muelleki/R/x86_64-pc-linux-gnu-library/3.3/dplyr/db/nycflights13.sqlite]</span>
<span class="co">#> </span>
-<span class="co">#> # S3: tbl_sqlite</span>
-<span class="co">#> year month day dep_time dep_delay arr_time arr_delay carrier tailnum</span>
-<span class="co">#> <int> <int> <int> <int> <dbl> <int> <dbl> <chr> <chr></span>
-<span class="co">#> 1 2013 1 1 517 2 830 11 UA N14228</span>
-<span class="co">#> 2 2013 1 1 533 4 850 20 UA N24211</span>
-<span class="co">#> 3 2013 1 1 542 2 923 33 AA N619AA</span>
-<span class="co">#> 4 2013 1 1 544 -1 1004 -18 B6 N804JB</span>
-<span class="co">#> ... with more rows, and 8 more variables: flight <int>, origin <chr>,</span>
-<span class="co">#> dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>,</span>
-<span class="co">#> speed <dbl></span>
+<span class="co">#> year month day dep_time sched_dep_time dep_delay arr_time</span>
+<span class="co">#> <int> <int> <int> <int> <int> <dbl> <int></span>
+<span class="co">#> 1 2013 1 1 517 515 2 830</span>
+<span class="co">#> 2 2013 1 1 533 529 4 850</span>
+<span class="co">#> 3 2013 1 1 542 540 2 923</span>
+<span class="co">#> 4 2013 1 1 544 545 -1 1004</span>
+<span class="co">#> # ... with more rows, and 13 more variables: sched_arr_time <int>,</span>
+<span class="co">#> # arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,</span>
+<span class="co">#> # origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,</span>
+<span class="co">#> # minute <dbl>, time_hour <dbl>, speed <dbl></span>
<span class="kw">summarise</span>(flights_sqlite, <span class="dt">delay =</span> <span class="kw">mean</span>(dep_time))
<span class="co">#> Source: query [?? x 1]</span>
-<span class="co">#> Database: sqlite 3.8.6 [/private/tmp/RtmpduTf9z/Rinst1430371d1558/dplyr/db/nycflights13.sqlite]</span>
+<span class="co">#> Database: sqlite 3.11.1 [/home/muelleki/R/x86_64-pc-linux-gnu-library/3.3/dplyr/db/nycflights13.sqlite]</span>
<span class="co">#> </span>
-<span class="co">#> # S3: tbl_sqlite</span>
<span class="co">#> delay</span>
<span class="co">#> <dbl></span>
<span class="co">#> 1 1349.11</span></code></pre></div>
@@ -185,26 +191,25 @@ c4 <-<span class="st"> </span><span class="kw">arrange</span>(c3, year, month
<p>Surprisingly, this sequence of operations never actually touches the database. It’s not until you ask for the data (e.g. by printing <code>c4</code>) that dplyr generates the SQL and requests the results from the database. Even then it only pulls down 10 rows.</p>
<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r">c4
<span class="co">#> Source: query [?? x 8]</span>
-<span class="co">#> Database: sqlite 3.8.6 [/private/tmp/RtmpduTf9z/Rinst1430371d1558/dplyr/db/nycflights13.sqlite]</span>
+<span class="co">#> Database: sqlite 3.11.1 [/home/muelleki/R/x86_64-pc-linux-gnu-library/3.3/dplyr/db/nycflights13.sqlite]</span>
<span class="co">#> </span>
-<span class="co">#> # S3: tbl_sqlite</span>
<span class="co">#> year month day carrier dep_delay air_time distance speed</span>
<span class="co">#> <int> <int> <int> <chr> <dbl> <dbl> <dbl> <dbl></span>
<span class="co">#> 1 2013 1 1 9E 0 189 1029 326.6667</span>
<span class="co">#> 2 2013 1 1 9E -9 57 228 240.0000</span>
<span class="co">#> 3 2013 1 1 9E -3 68 301 265.5882</span>
<span class="co">#> 4 2013 1 1 9E -6 57 209 220.0000</span>
-<span class="co">#> ... with more rows</span></code></pre></div>
+<span class="co">#> # ... with more rows</span></code></pre></div>
<p>To pull down all the results use <code>collect()</code>, which returns a <code>tbl_df()</code>:</p>
<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"><span class="kw">collect</span>(c4)
-<span class="co">#> # A tibble: 842 x 8</span>
+<span class="co">#> # A tibble: 842 × 8</span>
<span class="co">#> year month day carrier dep_delay air_time distance speed</span>
<span class="co">#> <int> <int> <int> <chr> <dbl> <dbl> <dbl> <dbl></span>
<span class="co">#> 1 2013 1 1 9E 0 189 1029 326.6667</span>
<span class="co">#> 2 2013 1 1 9E -9 57 228 240.0000</span>
<span class="co">#> 3 2013 1 1 9E -3 68 301 265.5882</span>
<span class="co">#> 4 2013 1 1 9E -6 57 209 220.0000</span>
-<span class="co">#> ... with 838 more rows</span></code></pre></div>
+<span class="co">#> # ... with 838 more rows</span></code></pre></div>
<p>You can see the query dplyr has generated by looking at the <code>query</code> component of the object:</p>
<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r">c4$query
<span class="co">#> NULL</span></code></pre></div>
@@ -220,73 +225,20 @@ c4 <-<span class="st"> </span><span class="kw">arrange</span>(c3, year, month
<span class="co">#> ORDER BY `year`, `month`, `day`, `carrier`</span>
<span class="co">#> </span>
<span class="co">#> <PLAN></span>
-<span class="co">#> addr opcode p1 p2 p3 p4 p5 comment</span>
-<span class="co">#> 1 0 Init 0 62 0 00 <NA></span>
-<span class="co">#> 2 1 SorterOpen 4 3 0 k(2,B,nil) 00 <NA></span>
-<span class="co">#> 3 2 OpenRead 3 100 0 16 00 <NA></span>
-<span class="co">#> 4 3 OpenRead 5 18795 0 k(4,nil,nil,nil,nil) 00 <NA></span>
-<span class="co">#> 5 4 Real 0 1 0 2013 00 <NA></span>
-<span class="co">#> 6 5 Real 0 2 0 1 00 <NA></span>
-<span class="co">#> 7 6 Real 0 3 0 1 00 <NA></span>
-<span class="co">#> 8 7 Affinity 1 3 0 ddd 00 <NA></span>
-<span class="co">#> 9 8 SeekGE 5 40 1 3 00 <NA></span>
-<span class="co">#> 10 9 IdxGT 5 40 1 3 00 <NA></span>
-<span class="co">#> 11 10 IdxRowid 5 4 0 00 <NA></span>
-<span class="co">#> 12 11 Seek 3 4 0 00 <NA></span>
-<span class="co">#> 13 12 Column 5 0 5 00 <NA></span>
-<span class="co">#> 14 13 Column 5 1 6 00 <NA></span>
-<span class="co">#> 15 14 Column 5 2 7 00 <NA></span>
-<span class="co">#> 16 15 Column 3 7 8 00 <NA></span>
-<span class="co">#> 17 16 Column 3 4 9 00 <NA></span>
-<span class="co">#> 18 17 RealAffinity 9 0 0 00 <NA></span>
-<span class="co">#> 19 18 Column 3 12 10 00 <NA></span>
-<span class="co">#> 20 19 RealAffinity 10 0 0 00 <NA></span>
-<span class="co">#> 21 20 Column 3 13 11 00 <NA></span>
-<span class="co">#> 22 21 RealAffinity 11 0 0 00 <NA></span>
-<span class="co">#> 23 22 Divide 10 11 13 00 <NA></span>
-<span class="co">#> 24 23 Multiply 15 13 12 00 <NA></span>
-<span class="co">#> 25 24 MakeRecord 5 8 13 00 <NA></span>
-<span class="co">#> 26 25 Column 5 0 17 00 <NA></span>
-<span class="co">#> 27 26 Column 5 1 18 00 <NA></span>
-<span class="co">#> 28 27 Column 5 2 19 00 <NA></span>
-<span class="co">#> 29 28 Column 3 7 20 00 <NA></span>
-<span class="co">#> 30 29 Sequence 4 21 0 00 <NA></span>
-<span class="co">#> 31 30 Move 13 22 1 00 <NA></span>
-<span class="co">#> 32 31 MakeRecord 20 3 16 00 <NA></span>
-<span class="co">#> 33 32 IfNot 21 37 0 00 <NA></span>
-<span class="co">#> 34 33 Compare 23 17 3 k(4,B,B,B,B) 00 <NA></span>
-<span class="co">#> 35 34 Jump 35 38 35 00 <NA></span>
-<span class="co">#> 36 35 Gosub 26 44 0 00 <NA></span>
-<span class="co">#> 37 36 ResetSorter 4 0 0 00 <NA></span>
-<span class="co">#> 38 37 Move 17 23 3 00 <NA></span>
-<span class="co">#> 39 38 SorterInsert 4 16 0 00 <NA></span>
-<span class="co">#> 40 39 Next 5 9 1 00 <NA></span>
-<span class="co">#> 41 40 Close 3 0 0 00 <NA></span>
-<span class="co">#> 42 41 Close 5 0 0 00 <NA></span>
-<span class="co">#> 43 42 Gosub 26 44 0 00 <NA></span>
-<span class="co">#> 44 43 Goto 0 61 0 00 <NA></span>
-<span class="co">#> 45 44 Once 0 47 0 00 <NA></span>
-<span class="co">#> 46 45 OpenPseudo 6 13 8 00 <NA></span>
-<span class="co">#> 47 46 OpenPseudo 7 27 3 00 <NA></span>
-<span class="co">#> 48 47 SorterSort 4 61 0 00 <NA></span>
-<span class="co">#> 49 48 SorterData 4 27 0 00 <NA></span>
-<span class="co">#> 50 49 Column 7 2 13 20 <NA></span>
-<span class="co">#> 51 50 Column 6 0 5 20 <NA></span>
-<span class="co">#> 52 51 Column 6 1 6 00 <NA></span>
-<span class="co">#> 53 52 Column 6 2 7 00 <NA></span>
-<span class="co">#> 54 53 Column 6 3 8 00 <NA></span>
-<span class="co">#> 55 54 Column 6 4 9 00 <NA></span>
-<span class="co">#> 56 55 Column 6 5 10 00 <NA></span>
-<span class="co">#> 57 56 Column 6 6 11 00 <NA></span>
-<span class="co">#> 58 57 Column 6 7 12 00 <NA></span>
-<span class="co">#> 59 58 ResultRow 5 8 0 00 <NA></span>
-<span class="co">#> 60 59 SorterNext 4 48 0 00 <NA></span>
-<span class="co">#> 61 60 Return 26 0 0 00 <NA></span>
-<span class="co">#> 62 61 Halt 0 0 0 00 <NA></span>
-<span class="co">#> 63 62 Transaction 0 0 16 0 01 <NA></span>
-<span class="co">#> 64 63 TableLock 0 100 0 flights 00 <NA></span>
-<span class="co">#> 65 64 Real 0 15 0 60 00 <NA></span>
-<span class="co">#> 66 65 Goto 0 1 0 00 <NA></span></code></pre></div>
+<span class="co">#> addr opcode p1 p2 p3 p4 p5 comment</span>
+<span class="co">#> 1 0 Init 0 56 0 00 NA</span>
+<span class="co">#> 2 1 SorterOpen 4 9 0 k(1,B) 00 NA</span>
+<span class="co">#> 3 2 OpenRead 3 100 0 19 00 NA</span>
+<span class="co">#> 4 3 OpenRead 5 22625 0 k(4,,,,) 02 NA</span>
+<span class="co">#> 5 4 Real 0 1 0 2013 00 NA</span>
+<span class="co">#> 6 5 Real 0 2 0 1 00 NA</span>
+<span class="co">#> 7 6 Real 0 3 0 1 00 NA</span>
+<span class="co">#> 8 7 Affinity 1 3 0 DDD 00 NA</span>
+<span class="co">#> 9 8 SeekGE 5 36 1 3 00 NA</span>
+<span class="co">#> 10 9 IdxGT 5 36 1 3 00 NA</span>
+<span class="co">#> 11 10 Seek 5 0 3 00 NA</span>
+<span class="co">#> 12 11 Column 5 0 8 00 NA</span>
+<span class="co">#> [ reached getOption("max.print") -- omitted 48 rows ]</span></code></pre></div>
<div id="forcing-computation" class="section level3">
<h3>Forcing computation</h3>
<p>There are three ways to force the computation of a query:</p>
@@ -348,8 +300,8 @@ c4 <-<span class="st"> </span><span class="kw">arrange</span>(c3, year, month
<p>Any function that dplyr doesn’t know how to convert is left as is. This means that database functions that are not covered by dplyr can be used directly via <code>translate_sql()</code>. Here a couple of examples that will work with <a href="http://www.sqlite.org/lang_corefunc.html">SQLite</a>:</p>
<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"><span class="kw">translate_sql</span>(<span class="kw">glob</span>(x, y))
<span class="co">#> <SQL> GLOB("x", "y")</span>
-<span class="kw">translate_sql</span>(x %like%<span class="st"> "ab*"</span>)
-<span class="co">#> <SQL> "x" LIKE 'ab*'</span></code></pre></div>
+<span class="kw">translate_sql</span>(x %like%<span class="st"> "ab%"</span>)
+<span class="co">#> <SQL> "x" LIKE 'ab%'</span></code></pre></div>
</div>
<div id="grouping" class="section level2">
<h2>Grouping</h2>
diff --git a/vignettes/disabled/benchmark-baseball.Rmd b/vignettes/disabled/benchmark-baseball.Rmd
deleted file mode 100644
index d0eda5d..0000000
--- a/vignettes/disabled/benchmark-baseball.Rmd
+++ /dev/null
@@ -1,182 +0,0 @@
-<!--
-%\VignetteEngine{knitr}
-%\VignetteIndexEntry{Baseball benchmarks}
--->
-
-```{r, echo = FALSE, message = FALSE}
-library(dplyr)
-library(microbenchmark)
-library(data.table)
-library(Lahman)
-knitr::opts_chunk$set(
- comment = "#>",
- error = FALSE,
- tidy = FALSE
-)
-
-options(digits = 3, microbenchmark.unit = "ms")
-```
-
-# Benchmarks: baseball data
-
-The purpose of these benchmarks is to be as fair as possible, to help understand the relatively performance tradeoffs of the different approaches. If you think my implementation of base or data.table equivalents is suboptimal, please let me know better ways.
-
-Also note that I consider any significant performance difference between `dt` and `dt_raw` to be a bug in dplyr: for individual operations there should be very little overhead to calling data.table via dplyr. However, data.table may be significantly faster when performing the same sequence of operations as dplyr. This is because currently dplyr uses an eager evaluation approach so the individual calls to `[.data.table` don't get as much information about the desired result as the single [...]
-
-Thanks go to Matt Dowle and Arun Srinivasan for their extensive feedback on these benchmarks.
-
-## Data setup
-
-The following benchmarks explore the performance on a somewhat realistic example: the `Batting` dataset from the Lahman package. It contains `r nrow(Batting)` records on the batting careers of `r length(Batting$playerID)` players from `r min(Batting$yearID)` to `r max(Batting$yearID)`.
-
-The first code block defines two alternative backends for the Batting dataset. Grouping operations are performed inline in each benchmark. This represents the common scenario where you group the data and immediately use it.
-
-```{r setup}
-batting_df <- tbl_df(Batting)
-batting_dt <- tbl_dt(Batting)
-```
-
-## Summarise
-
-Compute the average number of at bats for each player:
-
-```{r summarise-mean}
-microbenchmark(
- dplyr_df = batting_df %>% group_by(playerID) %>% summarise(ab = mean(AB)),
- dplyr_dt = batting_dt %>% group_by(playerID) %>% summarise(ab = mean(AB)),
- dt_raw = batting_dt[, list(ab = mean(AB)), by = playerID],
- base = tapply(batting_df$AB, batting_df$playerID, FUN = mean),
- times = 5
-)
-```
-
-NB: base implementation captures computation but not output format, giving considerably less output.
-
-However, this comparison is slightly unfair because both data.table and `summarise()` use tricks to find a more efficient implementation of `mean()`. Data table calls a `C` implementation of the `mean (using `.External(Cfastmean, B, FALSE)` and thus avoiding the overhead of S3 method dispatch). `dplyr::summarise` uses a hybrid evaluation technique, where common functions are implemented purely in C++, avoiding R function call overhead.
-
-```{r sumarise-mean_}
-mean_ <- function(x) .Internal(mean(x))
-microbenchmark(
- dplyr_df = batting_df %>% group_by(playerID) %>% summarise(ab = mean_(AB)),
- dplyr_dt = batting_dt %>% group_by(playerID) %>% summarise(ab = mean_(AB)),
- dt_raw = batting_dt[, list(ab = mean_(AB)), by = playerID],
- base = tapply(batting_df$AB, batting_df$playerID, FUN = mean_),
- times = 5
-)
-```
-
-## Arrange
-
-Arrange by year within each player:
-
-```{r arrange}
-microbenchmark(
- dplyr_df = batting_df %>% arrange(playerID, yearID),
- dplyr_dt = batting_dt %>% arrange(playerID, yearID),
- dt_raw = setkey(copy(batting_dt), playerID, yearID),
- base = batting_dt[order(batting_df$playerID, batting_df$yearID), ],
- times = 2
-)
-```
-
-## Filter
-
-Find the year for which each player played the most games:
-
-```{r filter}
-microbenchmark(
- dplyr_df = batting_df %>% group_by(playerID) %>% filter(G == max(G)),
- dplyr_dt = batting_dt %>% group_by(playerID) %>% filter(G == max(G)),
- dt_raw = batting_dt[batting_dt[, .I[G == max(G)], by = playerID]$V1],
- base = batting_df[ave(batting_df$G, batting_df$playerID, FUN = max) ==
- batting_df$G, ],
- times = 2
-)
-```
-
-I'm not aware of a single line data table equivalent ([see SO 16573995](http://stackoverflow.com/questions/16573995/)). Suggetions welcome. dplyr currently doesn't support hybrid evaluation for logical comparison, but it is scheduled for 0.2 (see [#113](https://github.com/hadley/dplyr/issues/113)), this should give an additional speed up.
-
-## Mutate
-
-Rank years based on number of at bats:
-
-```{r mutate}
-microbenchmark(
- dplyr_df = batting_df %>% group_by(playerID) %>% mutate(r = rank(desc(AB))),
- dplyr_dt = batting_dt %>% group_by(playerID) %>% mutate(r = rank(desc(AB))),
- dt_raw = copy(batting_dt)[, rank := rank(desc(AB)), by = playerID],
- times = 2
-)
-```
-
-(The `dt_raw` code needs to explicitly copy the data.table so the it doesn't modify in place, as is the data.table default. This is an example where it's difficult to compare data.table and dplyr directly because of different underlying philosophies.)
-
-Compute year of career:
-
-```{r mutate2}
-microbenchmark(
- dplyr_df = batting_df %>% group_by(playerID) %>%
- mutate(cyear = yearID - min(yearID) + 1),
- dplyr_dt = batting_dt %>% group_by(playerID) %>%
- mutate(cyear = yearID - min(yearID) + 1),
- dt_raw = copy(batting_dt)[, cyear := yearID - min(yearID) + 1,
- by = playerID],
- times = 5
-)
-```
-
-Rank is a relatively expensive operation and `min()` is relatively cheap, showing the the relative performance overhead of the difference techniques.
-
-dplyr currently has some support for hybrid evaluation of window functions. This yields substantial speed-ups where available:
-
-```{r mutate_hybrid}
-min_rank_ <- min_rank
-microbenchmark(
- hybrid = batting_df %>% group_by(playerID) %>% mutate(r = min_rank(AB)),
- regular = batting_df %>% group_by(playerID) %>% mutate(r = min_rank_(AB)),
- times = 2
-)
-```
-
-## Joins
-
-We conclude with some quick comparisons of joins. First we create two new datasets: `master` which contains demographic information on each player, and `hall_of_fame` which contains all players inducted into the hall of fame.
-
-```{r}
-master_df <- tbl_df(Master) %>% select(playerID, birthYear)
-hall_of_fame_df <- tbl_df(HallOfFame) %>% filter(inducted == "Y") %>%
- select(playerID, votedBy, category)
-
-master_dt <- tbl_dt(Master) %>% select(playerID, birthYear)
-hall_of_fame_dt <- tbl_dt(HallOfFame) %>% filter(inducted == "Y") %>%
- select(playerID, votedBy, category)
-```
-
-
-```{r}
-microbenchmark(
- dplyr_df = left_join(master_df, hall_of_fame_df, by = "playerID"),
- dplyr_dt = left_join(master_dt, hall_of_fame_dt, by = "playerID"),
- base = merge(master_df, hall_of_fame_df, by = "playerID", all.x = TRUE),
- times = 10
-)
-
-microbenchmark(
- dplyr_df = inner_join(master_df, hall_of_fame_df, by = "playerID"),
- dplyr_dt = inner_join(master_dt, hall_of_fame_dt, by = "playerID"),
- base = merge(master_df, hall_of_fame_df, by = "playerID"),
- times = 10
-)
-
-microbenchmark(
- dplyr_df = semi_join(master_df, hall_of_fame_df, by = "playerID"),
- dplyr_dt = semi_join(master_dt, hall_of_fame_dt, by = "playerID"),
- times = 10
-)
-
-microbenchmark(
- dplyr_df = anti_join(master_df, hall_of_fame_df, by = "playerID"),
- dplyr_dt = anti_join(master_dt, hall_of_fame_dt, by = "playerID"),
- times = 10
-)
-```
diff --git a/inst/doc/introduction.Rmd b/vignettes/dplyr.Rmd
similarity index 54%
rename from inst/doc/introduction.Rmd
rename to vignettes/dplyr.Rmd
index 354320d..346e114 100644
--- a/inst/doc/introduction.Rmd
+++ b/vignettes/dplyr.Rmd
@@ -1,6 +1,5 @@
---
title: "Introduction to dplyr"
-date: "`r Sys.Date()`"
output: rmarkdown::html_vignette
vignette: >
%\VignetteIndexEntry{Introduction to dplyr}
@@ -13,6 +12,7 @@ knitr::opts_chunk$set(collapse = T, comment = "#>")
options(tibble.print_min = 4L, tibble.print_max = 4L)
library(dplyr)
library(ggplot2)
+set.seed(1014)
```
When working with data you must:
@@ -25,53 +25,42 @@ When working with data you must:
The dplyr package makes these steps fast and easy:
-* By constraining your options, it simplifies how you can think about common data manipulation tasks.
+* By constraining your options, it helps you think about your data manipulation
+ challenges.
-* It provides simple "verbs", functions that correspond to the most common data manipulation tasks, to help you translate those thoughts into code.
+* It provides simple "verbs", functions that correspond to the most common data
+ manipulation tasks, to help you translate your thoughts into code.
-* It uses efficient data storage backends, so you spend less time waiting for the computer.
+* It uses efficient backends, so you spend less time waiting for the computer.
-This document introduces you to dplyr's basic set of tools, and shows you how to apply them to data frames. Other vignettes provide more details on specific topics:
-
-* databases: Besides in-memory data frames, dplyr also connects to out-of-memory, remote databases. And by translating your R code into the appropriate SQL, it allows you to work with both types of data using the same set of tools.
-
-* benchmark-baseball: see how dplyr compares to other tools for data
- manipulation on a realistic use case.
-
-* window-functions: a window function is a variation on an aggregation
- function. Where an aggregate function uses `n` inputs to produce 1
- output, a window function uses `n` inputs to produce `n` outputs.
+This document introduces you to dplyr's basic set of tools, and shows you how to apply them to data frames. dplyr also supports databases via the dbplyr package; once you've installed it, read `vignette("dbplyr")` to learn more.
## Data: nycflights13
-To explore the basic data manipulation verbs of dplyr, we'll start with the built in
-`nycflights13` data frame. This dataset contains all `r nrow(nycflights13::flights)` flights that departed from New York City in 2013. The data comes from the US [Bureau of Transportation Statistics](http://www.transtats.bts.gov/DatabaseInfo.asp?DB_ID=120&Link=0), and is documented in `?nycflights13`
+To explore the basic data manipulation verbs of dplyr, we'll use `nycflights13::flights`. This dataset contains all `r nrow(nycflights13::flights)` flights that departed from New York City in 2013. The data comes from the US [Bureau of Transportation Statistics](http://www.transtats.bts.gov/DatabaseInfo.asp?DB_ID=120&Link=0), and is documented in `?nycflights13`
```{r}
library(nycflights13)
dim(flights)
-head(flights)
+flights
```
-dplyr can work with data frames as is, but if you're dealing with large data, it's worthwhile to convert them to a `tbl_df`: this is a wrapper around a data frame that won't accidentally print a lot of data to the screen.
+Note that `nycflights13::flights` is a tibble, a modern reimagining of the data frame. It's particularly useful for large datasets because it only prints the first few rows. You can learn more about tibbles at <http://tibble.tidyverse.org>; in particular you can convert data frames to tibbles with `as_tibble()`.
## Single table verbs
Dplyr aims to provide a function for each basic verb of data manipulation:
-* `filter()` (and `slice()`)
-* `arrange()`
-* `select()` (and `rename()`)
-* `distinct()`
-* `mutate()` (and `transmute()`)
-* `summarise()`
-* `sample_n()` (and `sample_frac()`)
-
-If you've used plyr before, many of these will be familar.
+* `filter()` to select cases based on their values.
+* `arrange()` to reorder the cases.
+* `select()` and `rename()` to select variables based on their names.
+* `mutate()` and `transmute()` to add new variables that are functions of existing variables.
+* `summarise()` to condense multiple values to a single value.
+* `sample_n()` and `sample_frac()` to take random samples.
-## Filter rows with `filter()`
+### Filter rows with `filter()`
-`filter()` allows you to select a subset of rows in a data frame. The first argument is the name of the data frame. The second and subsequent arguments are the expressions that filter the data frame:
+`filter()` allows you to select a subset of rows in a data frame. Like all single verbs, the first argument is the tibble (or data frame). The second and subsequent arguments refer to variables within that data frame, selecting rows where the expression is `TRUE`.
For example, we can select all flights on January 1st with:
@@ -79,25 +68,13 @@ For example, we can select all flights on January 1st with:
filter(flights, month == 1, day == 1)
```
-This is equivalent to the more verbose code in base R:
+This is roughly equivalent to this base R code:
```{r, eval = FALSE}
flights[flights$month == 1 & flights$day == 1, ]
```
-`filter()` works similarly to `subset()` except that you can give it any number of filtering conditions, which are joined together with `&` (not `&&` which is easy to do accidentally!). You can also use other boolean operators:
-
-```{r, eval = FALSE}
-filter(flights, month == 1 | month == 2)
-```
-
-To select rows by position, use `slice()`:
-
-```{r}
-slice(flights, 1:10)
-```
-
-## Arrange rows with `arrange()`
+### Arrange rows with `arrange()`
`arrange()` works similarly to `filter()` except that instead of filtering or selecting rows, it reorders them. It takes a data frame, and a set of column names (or more complicated expressions) to order by. If you provide more than one column name, each additional column will be used to break ties in the values of preceding columns:
@@ -111,14 +88,7 @@ Use `desc()` to order a column in descending order:
arrange(flights, desc(arr_delay))
```
-`dplyr::arrange()` works the same way as `plyr::arrange()`. It's a straightforward wrapper around `order()` that requires less typing. The previous code is equivalent to:
-
-```{r, eval = FALSE}
-flights[order(flights$year, flights$month, flights$day), ]
-flights[order(flights$arr_delay, decreasing = TRUE), ] or flights[order(-flights$arr_delay), ]
-```
-
-## Select columns with `select()`
+### Select columns with `select()`
Often you work with large datasets with many columns but only a few are actually of interest to you. `select()` allows you to rapidly zoom in on a useful subset using operations that usually only work on numeric variable positions:
@@ -131,8 +101,6 @@ select(flights, year:day)
select(flights, -(year:day))
```
-This function works similarly to the `select` argument in `base::subset()`. Because the dplyr philosophy is to have small functions that do one thing well, it's its own function in dplyr.
-
There are a number of helper functions you can use within `select()`, like `starts_with()`, `ends_with()`, `matches()` and `contains()`. These let you quickly match larger blocks of variables that meet some criterion. See `?select` for more details.
You can rename variables with `select()` by using named arguments:
@@ -147,28 +115,18 @@ But because `select()` drops all the variables not explicitly mentioned, it's no
rename(flights, tail_num = tailnum)
```
-## Extract distinct (unique) rows
-
-Use `distinct()`to find unique values in a table:
-
-```{r}
-distinct(flights, tailnum)
-distinct(flights, origin, dest)
-```
-
-(This is very similar to `base::unique()` but should be much faster.)
-
-## Add new columns with `mutate()`
+### Add new columns with `mutate()`
Besides selecting sets of existing columns, it's often useful to add new columns that are functions of existing columns. This is the job of `mutate()`:
```{r}
mutate(flights,
gain = arr_delay - dep_delay,
- speed = distance / air_time * 60)
+ speed = distance / air_time * 60
+)
```
-`dplyr::mutate()` works the same way as `plyr::mutate()` and similarly to `base::transform()`. The key difference between `mutate()` and `transform()` is that mutate allows you to refer to columns that you've just created:
+`dplyr::mutate()` is similar to the base `transform()`, but allows you to refer to columns that you've just created:
```{r}
mutate(flights,
@@ -177,14 +135,6 @@ mutate(flights,
)
```
-```{r, eval = FALSE}
-transform(flights,
- gain = arr_delay - delay,
- gain_per_hour = gain / (air_time / 60)
-)
-#> Error: object 'gain' not found
-```
-
If you only want to keep the new variables, use `transmute()`:
```{r}
@@ -194,18 +144,19 @@ transmute(flights,
)
```
-## Summarise values with `summarise()`
+### Summarise values with `summarise()`
-The last verb is `summarise()`. It collapses a data frame to a single row (this is exactly equivalent to `plyr::summarise()`):
+The last verb is `summarise()`. It collapses a data frame to a single row.
```{r}
summarise(flights,
- delay = mean(dep_delay, na.rm = TRUE))
+ delay = mean(dep_delay, na.rm = TRUE)
+)
```
-Below, we'll see how this verb can be very useful.
+It's not that useful until we learn the `group_by()` verb below.
-## Randomly sample rows with `sample_n()` and `sample_frac()`
+### Randomly sample rows with `sample_n()` and `sample_frac()`
You can use `sample_n()` and `sample_frac()` to take a random sample of rows: use `sample_n()` for a fixed number and `sample_frac()` for a fixed fraction.
@@ -216,14 +167,14 @@ sample_frac(flights, 0.01)
Use `replace = TRUE` to perform a bootstrap sample. If needed, you can weight the sample with the `weight` argument.
-## Commonalities
+### Commonalities
You may have noticed that the syntax and function of all these verbs are very similar:
* The first argument is a data frame.
-* The subsequent arguments describe what to do with the data frame. Notice that you can refer
- to columns in the data frame directly without using `$`.
+* The subsequent arguments describe what to do with the data frame. You can
+ refer to columns in the data frame directly without using `$`.
* The result is a new data frame
@@ -231,16 +182,32 @@ Together these properties make it easy to chain together multiple simple steps t
These five functions provide the basis of a language of data manipulation. At the most basic level, you can only alter a tidy data frame in five useful ways: you can reorder the rows (`arrange()`), pick observations and variables of interest (`filter()` and `select()`), add new variables that are functions of existing variables (`mutate()`), or collapse many values to a summary (`summarise()`). The remainder of the language comes from applying the five functions to different types of dat [...]
-# Grouped operations
-These verbs are useful on their own, but they become really powerful when you apply them to groups of observations within a dataset. In dplyr, you do this by with the `group_by()` function. It breaks down a dataset into specified groups of rows. When you then apply the verbs above on the resulting object they'll be automatically applied "by group". Most importantly, all this is achieved by using the same exact syntax you'd use with an ungrouped object.
+## Patterns of operations
+
+The dplyr verbs can be classified by the type of operations they
+accomplish (we sometimes speak of their **semantics**, i.e., their
+meaning). The most important and useful distinction is between grouped
+and ungrouped operations. In addition, it is helpful to have a good
+grasp of the difference between select and mutate operations.
+
+
+### Grouped operations
+
+The dplyr verbs are useful on their own, but they become even more
+powerful when you apply them to groups of observations within a
+dataset. In dplyr, you do this with the `group_by()` function. It
+breaks down a dataset into specified groups of rows. When you then
+apply the verbs above on the resulting object they'll be automatically
+applied "by group".
Grouping affects the verbs as follows:
* grouped `select()` is the same as ungrouped `select()`, except that
grouping variables are always retained.
-* grouped `arrange()` orders first by the grouping variables
+* grouped `arrange()` is the same as ungrouped; unless you set
+ `.by_group = TRUE`, in which case it orders first by the grouping variables
* `mutate()` and `filter()` are most useful in conjunction with window
functions (like `rank()`, or `min(x) == x`). They are described in detail in
@@ -248,13 +215,10 @@ Grouping affects the verbs as follows:
* `sample_n()` and `sample_frac()` sample the specified number/fraction of
rows in each group.
-
-* `slice()` extracts rows within each group.
-
-* `summarise()` is powerful and easy to understand, as described in
- more detail below.
-In the following example, we split the complete dataset into individual planes and then summarise each plane by counting the number of flights (`count = n()`) and computing the average distance (`dist = mean(Distance, na.rm = TRUE)`) and arrival delay (`delay = mean(ArrDelay, na.rm = TRUE)`). We then use ggplot2 to display the output.
+* `summarise()` computes the summary for each group.
+
+In the following example, we split the complete dataset into individual planes and then summarise each plane by counting the number of flights (`count = n()`) and computing the average distance (`dist = mean(distance, na.rm = TRUE)`) and arrival delay (`delay = mean(arr_delay, na.rm = TRUE)`). We then use ggplot2 to display the output.
```{r, warning = FALSE, message = FALSE, fig.width = 6}
by_tailnum <- group_by(flights, tailnum)
@@ -292,8 +256,6 @@ summarise(destinations,
)
```
-You can also use any function that you write yourself. For performance, dplyr provides optimised C++ versions of many of these functions. If you want to provide your own C++ function, see the hybrid-evaluation vignette for more details.
-
When you group by multiple variables, each summary peels off one level of the grouping. That makes it easy to progressively roll-up a dataset:
```{r}
@@ -305,7 +267,166 @@ daily <- group_by(flights, year, month, day)
However you need to be careful when progressively rolling up summaries like this: it's ok for sums and counts, but you need to think about weighting for means and variances (it's not possible to do this exactly for medians).
-## Chaining
+
+### Selecting operations
+
+One of the appealing features of dplyr is that you can refer to
+columns from the tibble as if they were regular variables. However,
+the syntactic uniformity of referring to bare column names hides
+semantic differences across the verbs. A column symbol supplied to
+`select()` does not have the same meaning as the same symbol supplied
+to `mutate()`.
+
+Selecting operations expect column names and positions. Hence, when
+you call `select()` with bare variable names, they actually represent
+their own positions in the tibble. The following calls are completely
+equivalent from dplyr's point of view:
+
+```{r}
+# `year` represents the integer 1
+select(flights, year)
+select(flights, 1)
+```
+
+By the same token, this means that you cannot refer to variables from
+the surrounding context if they have the same name as one of the
+columns. In the following example, `year` still represents 1, not 5:
+
+```r
+year <- 5
+select(flights, year)
+```
+
+One useful subtlety is that this only applies to bare names and to
+selecting calls like `c(year, month, day)` or `year:day`. In all other
+cases, the columns of the data frame are not put in scope. This allows
+you to refer to contextual variables in selection helpers:
+
+```{r}
+year <- "dep"
+select(flights, starts_with(year))
+```
+
+These semantics are usually intuitive. But note the subtle difference:
+
+```{r}
+year <- 5
+select(flights, year, identity(year))
+```
+
+In the first argument, `year` represents its own position `1`. In the
+second argument, `year` is evaluated in the surrounding context and
+represents the fifth column.
+
+For a long time, `select()` used to only understand column positions.
+Counting from dplyr 0.6, it now understands column names as well. This
+makes it a bit easier to program with `select()`:
+
+```{r}
+vars <- c("year", "month")
+select(flights, vars, "day")
+```
+
+Note that the code above is somewhat unsafe because you might have
+added a column named `vars` to the tibble, or you might apply the code
+to another data frame containing such a column. To avoid this issue,
+you can wrap the variable in an `identity()` call as we mentioned
+above, as this will bypass column names. However, a more explicit and
+general method that works in all dplyr verbs is to unquote the
+variable with the `!!` operator. This tells dplyr to bypass the data
+frame and to directly look in the context:
+
+```{r}
+# Let's create a new `vars` column:
+flights$vars <- flights$year
+
+# The new column won't be an issue if you evaluate `vars` in the
+# context with the `!!` operator:
+vars <- c("year", "month", "day")
+select(flights, !! vars)
+```
+
+This operator is very useful when you need to use dplyr within custom
+functions. You can learn more about it in `vignette("programming")`.
+However it is important to understand the semantics of the verbs you
+are unquoting into, that is, the values they understand. As we have
+just seen, `select()` supports names and positions of columns. But
+that won't be the case in other verbs like `mutate()` because they
+have different semantics.
+
+
+### Mutating operations
+
+Mutate semantics are quite different from selection semantics. Whereas
+`select()` expects column names or positions, `mutate()` expects
+*column vectors*. Let's create a smaller tibble for clarity:
+
+```{r}
+df <- select(flights, year:dep_time)
+```
+
+When we use `select()`, the bare column names stand for their own
+positions in the tibble. For `mutate()` on the other hand, column
+symbols represent the actual column vectors stored in the tibble.
+Consider what happens if we give a string or a number to `mutate()`:
+
+```{r}
+mutate(df, "year", 2)
+```
+
+`mutate()` gets length-1 vectors that it interprets as new columns in
+the data frame. These vectors are recycled so they match the number of
+rows. That's why it doesn't make sense to supply expressions like
+`"year" + 10` to `mutate()`. This amounts to adding 10 to a string!
+The correct expression is:
+
+```{r}
+mutate(df, year + 10)
+```
+
+In the same way, you can unquote values from the context if these
+values represent a valid column. They must be either length 1 (they
+then get recycled) or have the same length as the number of rows. In
+the following example we create a new vector that we add to the data
+frame:
+
+```{r}
+var <- seq(1, nrow(df))
+mutate(df, new = var)
+```
+
+A case in point is `group_by()`. While you might think it has select
+semantics, it actually has mutate semantics. This is quite handy as it
+allows you to group by a modified column:
+
+```{r}
+group_by(df, month)
+group_by(df, month = as.factor(month))
+group_by(df, day_binned = cut(day, 3))
+```
+
+This is why you can't supply a column name to `group_by()`. This
+amounts to creating a new column containing the string recycled to the
+number of rows:
+
+```{r}
+group_by(df, "month")
+```
+
+Since grouping with select semantics can sometimes be useful as well,
+we have added the `group_by_at()` variant. In dplyr, variants suffixed
+with `_at()` support selection semantics in their second argument. You
+just need to wrap the selection with `vars()`:
+
+```{r}
+group_by_at(df, vars(year:day))
+```
+
+You can read more about the `_at()` and `_if()` variants in the
+`?scoped` help page.
+
+
+## Piping
The dplyr API is functional in the sense that function calls don't have side-effects. You must always save their results. This doesn't lead to particularly elegant code, especially if you want to do many operations at once. You either have to do it step-by-step:
@@ -318,7 +439,7 @@ a3 <- summarise(a2,
a4 <- filter(a3, arr > 30 | dep > 30)
```
-Or if you don't want to save the intermediate results, you need to wrap the function calls inside each other:
+Or if you don't want to name the intermediate results, you need to wrap the function calls inside each other:
```{r}
filter(
@@ -334,7 +455,7 @@ filter(
)
```
-This is difficult to read because the order of the operations is from inside to out. Thus, the arguments are a long way away from the function. To get around this problem, dplyr provides the `%>%` operator. `x %>% f(y)` turns into `f(x, y)` so you can use it to rewrite multiple operations that you can read left-to-right, top-to-bottom:
+This is difficult to read because the order of the operations is from inside to out. Thus, the arguments are a long way away from the function. To get around this problem, dplyr provides the `%>%` operator from magrittr. `x %>% f(y)` turns into `f(x, y)` so you can use it to rewrite multiple operations that you can read left-to-right, top-to-bottom:
```{r, eval = FALSE}
flights %>%
@@ -347,11 +468,11 @@ flights %>%
filter(arr > 30 | dep > 30)
```
-# Other data sources
+## Other data sources
As well as data frames, dplyr works with data that is stored in other ways, like data tables, databases and multidimensional arrays.
-## Data table
+### Data table
dplyr also provides [data table](http://datatable.r-forge.r-project.org/) methods for all verbs through [dtplyr](http://github.com/hadley/dtplyr). If you're using data.tables already this lets you to use dplyr syntax for data manipulation, and data.table for everything else.
@@ -366,21 +487,15 @@ The advantages of using dplyr with data tables are:
* Instead of one complex method built on the subscripting operator (`[`),
it provides many simple methods.
-## Databases
-
-dplyr also allows you to use the same verbs with a remote database. It takes care of generating the SQL for you so that you can avoid the cognitive challenge of constantly switching between languages. See the databases vignette for more details.
-
-Compared to DBI and the database connection algorithms:
+### Databases
-* it hides, as much as possible, the fact that you're working with a remote database
-* you don't need to know any SQL (although it helps!)
-* it abstracts over the many differences between the different DBI implementations
+dplyr also allows you to use the same verbs with a remote database. It takes care of generating the SQL for you so that you can avoid the cognitive challenge of constantly switching between languages. To use these capabilities, you'll need to install the dbplyr package and then read `vignette("dbplyr")` for the details.
-## Multidimensional arrays / cubes
+### Multidimensional arrays / cubes
`tbl_cube()` provides an experimental interface to multidimensional arrays or data cubes. If you're using this form of data in R, please get in touch so I can better understand your needs.
-# Comparisons
+## Comparisons
Compared to all existing options, dplyr:
diff --git a/vignettes/hybrid-evaluation.Rmd b/vignettes/hybrid-evaluation.Rmd
deleted file mode 100644
index 8135895..0000000
--- a/vignettes/hybrid-evaluation.Rmd
+++ /dev/null
@@ -1,344 +0,0 @@
----
-title: "Hybrid evaluation"
-date: "`r Sys.Date()`"
-output: rmarkdown::html_vignette
-vignette: >
- %\VignetteIndexEntry{Hybrid evaluation}
- %\VignetteEngine{knitr::rmarkdown}
- \usepackage[utf8]{inputenc}
----
-
-```{r, echo = FALSE, message = FALSE}
-knitr::opts_chunk$set(collapse = T, comment = "#>")
-options(tibble.print_min = 4L, tibble.print_max = 4L)
-library(dplyr)
-```
-
-Consider this call to `summarise` :
-
-```{r, eval = FALSE}
-summarise(per_day, flights = sum(flights))
-```
-
-One of the ways `dplyr` achieves dramatic speedups is that expressions might not
-be evaluated by R, but by alternative code that is faster and uses less memory.
-
-Conceptually the call to `summarise` above evaluates the expression `sum(flights)`
-on each subset of `flights` controlled by the grouping of `per_day`. This involves
-creating a new R vector to store the chunk and evaluate the R expression.
-
-Evaluating the R expression might carry costs that can be avoided, i.e.
-S3 dispatch, ...
-
-`dplyr` recognizes the expression `sum(flights)` as the `sum` function applied
-to a known column of the data, making it possible to handle the dispatch
-early and once, avoid unneeded memory allocations and compute the result faster.
-
-Hybrid evaluation is able to work on subexpressions. Consider:
-
-```{r, eval=FALSE}
-foo <- function(x) x*x
-summarise(per_day, flights = foo(sum(flights)) )
-```
-
-`dplyr` will substitute the subexpressions it knows how to handle and leave the
-rest to standard R evaluation. Instead of evaluating `foo(sum(flights))`, R will
-only have to evaluate `foo(z)` where `z` is the result of the internal evaluation
-of `sum(flights)`.
-
-# Implementation
-
-Hybrid evaluation is designed to be extensible. Before we start registering
-custom hybrid evaluation handlers, we need to understand the system.
-
-The first building block we need to cover is the `Result` class.
-
-```cpp
-namespace dplyr {
- class Result {
- public:
- Result(){}
- virtual ~Result(){} ;
-
- virtual SEXP process( const GroupedDataFrame& gdf) = 0 ;
-
- virtual SEXP process( const FullDataFrame& df ) = 0 ;
-
- virtual SEXP process( const SlicingIndex& index ){
- return R_NilValue ;
- }
-
- } ;
-}
-```
-
-The two first methods deal with grouped and ungrouped data frames.
-We will mainly focus on the last method that operates on a `SlicingIndex`.
-
-`SlicingIndex` is a class that encapsulates indices of a single chunk of a grouped
-data frame.
-
-Hybrid evaluation really just is deriving from the `Result` class. Let's consider
-a simpler version of `sum` that only handles numeric vectors. (The internal version
-is more complete, handles missing values, ...).
-
-```cpp
-class Sum : public Result {
- public:
- Sum( NumericVector data_ ): data(data_){}
-
- SEXP process( const SlicingIndex& index ){
- double res = 0.0 ;
- for( int i=0; i<index.size(); i++) res += data[ index[i] ] ;
- return NumericVector::create( res );
- }
-
- virtual SEXP process( const GroupedDataFrame& gdf){
- ...
- }
- virtual SEXP process( const FullDataFrame& df ){
- ...
- }
-
- private:
- NumericVector data ;
-} ;
-```
-
-## Using Processor
-
-Implementation of `Result` derived classes can be facilitated by the template
-class `Processor`. `Processor` is templated by two template parameters:
-
- - the R output type (`REALSXP`, `STRSXP`, ...)
- - the class you are defining. (This uses the CRTP pattern).
-
-When using `Processor` we only have to supply a `process_chunk` method
-that takes a `const SlicingIndex&` as input and returns an object suitable to
-go into a vector of the type controlled by the first template parameter.
-
-The purpose of the `Processor` template is then to generate the boiler plate
-code for the three `process` methods defined by the `Result` interface.
-
-A possible `Sum` implementation would then look something like this:
-
-```cpp
-class Sum : public Processor<REALSXP, Sum> {
- public:
- Sum( NumericVector data_ ): data(data_){}
-
- double process_chunk( const SlicingIndex& index ){
- double res = 0.0 ;
- for( int i=0; i<index.size(); i++) res += data[ index[i] ] ;
- return res;
- }
-
- private:
- NumericVector data ;
-}
-```
-
-Recognizing genericity here, we might want to make `Sum` a template class
-in order to handle more than just numeric vector :
-
-```cpp
-template <int RTYPE>
-class Sum : public Processor<REALSXP, Sum<RTYPE> > {
- public:
- typedef typename Rcpp::traits::storage_type<RTYPE>::type STORAGE ;
-
- Sum( Vector<RTYPE> data_ ): data(data_){}
-
- STORAGE process_chunk( const SlicingIndex& index ){
- STORAGE res = 0.0 ;
- for( int i=0; i<index.size(); i++) res += data[ index[i] ] ;
- return res;
- }
-
- private:
- Vector<RTYPE> data ;
-}
-```
-
-Aside from not dealing with missing data and using internal knowledge of the
-`SlicingIndex` class, this implementation of `Sum` is close to the
-internal implementation in `dplyr`.
-
-## Retrieving hybrid handlers
-
-`dplyr` functions use the `get_handler` function to retrieve handlers for
-particular expressions.
-
-```cpp
-Result* get_handler( SEXP call, const LazySubsets& subsets ){
- int depth = Rf_length(call) ;
- HybridHandlerMap& handlers = get_handlers() ;
- SEXP fun_symbol = CAR(call) ;
- if( TYPEOF(fun_symbol) != SYMSXP ) return 0 ;
-
- HybridHandlerMap::const_iterator it = handlers.find( fun_symbol ) ;
- if( it == handlers.end() ) return 0 ;
-
- return it->second( call, subsets, depth - 1 );
-}
-```
-
-The `get_handler` performs a lookup in a hash table of type `HybridHandlerMap`.
-
-```cpp
-typedef dplyr::Result* (*HybridHandler)(SEXP, const dplyr::LazySubsets&, int) ;
-typedef dplyr_hash_map<SEXP,HybridHandler> HybridHandlerMap ;
-```
-
-`HybridHandlerMap` is simply a hash map where the map key is the symbol of the
-function and the map value type is a function pointer defined by `HybridHandler`.
-
-The parameters of the `HybridHandler` function pointer type are:
-
- - The call we want to hybridify, e.g. something like `sum(flights)`
- - a `LazySubsets` reference. The only thing that is relevant about
- this class here is that it defines a `get_variable` method that takes a
- symbol `SEXP` and returns the corresponding variable from the data frame.
- - The number of arguments of the call. For example for `sum(flights)`, the number
- of arguments is `1`.
-
-The purpose of the hybrid handler function is to return a `Result*` if it can handle
-the call or 0 if it cannot.
-
-with our previous `Sum` template class, we could define a hybrid handler function
-like this:
-
-```cpp
-Result* sum_handler(SEXP call, const LazySubsets& subsets, int nargs ){
- // we only know how to deal with one argument
- if( nargs != 1 ) return 0 ;
-
- // get the first argument
- SEXP arg = CADR(call) ;
-
- // if this is a symbol, extract the variable from the subsets
- if( TYPEOF(arg) == SYMSXP ) arg = subsets.get_variable(arg) ;
-
- // we know how to handle integer vectors and numeric vectors
- switch( TYPEOF(arg) ){
- case INTSXP: return new Sum<INTSXP>(arg) ;
- case REALSXP: return new Sum<REALSXP>(arg) ;
- default: break ;
- }
-
- // we are here if we could not handle the call
- return 0 ;
-}
-```
-
-## Registering hybrid handlers
-
-`dplyr` enables users, most likely packages, to register their own hybrid handlers
-through the `registerHybridHandler`.
-
-```cpp
-void registerHybridHandler( const char* , HybridHandler ) ;
-```
-
-To register the handler we created above, we then simply:
-
-```cpp
-registerHybridHandler( "sum", sum_handler ) ;
-```
-
-## Putting it all together
-
-We are going to register a handler called `hitchhiker` that always returns the
-answer to everything, i.e. `42`.
-
-The code below is suitable to run through `sourceCpp`.
-
-```cpp
-#include <dplyr.h>
-// [[Rcpp::depends(dplyr,BH)]]
-
-using namespace dplyr ;
-using namespace Rcpp ;
-
-// the class that derives from Result through Processor
-class Hitchhiker : public Processor<INTSXP,Hitchhiker>{
-public:
-
- // always returns 42, as it is the answer to everything
- int process_chunk( const SlicingIndex& ){
- return 42 ;
- }
-} ;
-
-// we actually don't need the arguments
-// we can just let this handler return a new Hitchhiker pointer
-Result* hitchhiker_handler( SEXP, const LazySubsets&, int ){
- return new Hitchhiker ;
-}
-
-// registration of the handler, called from R, so exported through Rcpp::export
-// [[Rcpp::export]]
-void registerHitchhiker(){
- registerHybridHandler( "hitchhiker", hitchhiker_handler );
-}
-
-/*** R
- require(dplyr)
- registerHitchhiker()
-
- n <- 10000
- df <- group_by( tbl_df( data.frame(
- id = sample( letters[1:4], 1000, replace = TRUE ),
- x = rnorm(n)
- ) ), id )
- summarise( df, y = hitchhiker() )
- # Source: local data frame [4 x 2]
- # Groups:
- #
- # id y
- # 1 a 42
- # 2 b 42
- # 3 c 42
- # 4 d 42
-
- summarise(df, y = mean(x) + hitchhiker())
- # Source: local data frame [4 x 2]
- # Groups:
- #
- # id y
- # 1 a 42.00988
- # 2 b 42.00988
- # 3 c 42.01440
- # 4 d 41.99160
-*/
-```
-
-## Registering hybrid handlers with a package
-
-To register custom handlers in packages, the best place is the `init` entry point
-that R automatically calls when a package is loaded.
-
-Instead of exposing the `registerHitchhiker` function as above, packages would typically
-register handlers like this:
-
-```
-#include <Rcpp.h>
-#include <dplyr.h>
-
-// R automatically calls this function when the mypack package is loaded.
-extern "C" void R_init_mypack( DllInfo* info ){
- registerHybridHandler( "hitchhiker", hitchhiker_handler );
-}
-```
-
-For this your package must know about Rcpp and dplyr's headers, which requires
-this information in the `DESCRIPTION` file:
-
-```
-LinkingTo: Rcpp, dplyr, BH
-```
-
-The `Makevars` and `Makevars.win` are similar to those used for any package
-that uses `Rcpp` features. See the `Rcpp` vignettes for details.
-
-
diff --git a/inst/doc/hybrid-evaluation.html b/vignettes/hybrid-evaluation.html
similarity index 97%
rename from inst/doc/hybrid-evaluation.html
rename to vignettes/hybrid-evaluation.html
index 307c989..d109982 100644
--- a/inst/doc/hybrid-evaluation.html
+++ b/vignettes/hybrid-evaluation.html
@@ -11,7 +11,7 @@
<meta name="viewport" content="width=device-width, initial-scale=1">
-<meta name="date" content="2016-06-23" />
+<meta name="date" content="2016-10-27" />
<title>Hybrid evaluation</title>
@@ -68,7 +68,7 @@ code > span.in { color: #60a0b0; font-weight: bold; font-style: italic; } /* Inf
<h1 class="title toc-ignore">Hybrid evaluation</h1>
-<h4 class="date"><em>2016-06-23</em></h4>
+<h4 class="date"><em>2016-10-27</em></h4>
@@ -170,7 +170,7 @@ code > span.in { color: #60a0b0; font-weight: bold; font-style: italic; } /* Inf
<div id="retrieving-hybrid-handlers" class="section level2">
<h2>Retrieving hybrid handlers</h2>
<p><code>dplyr</code> functions use the <code>get_handler</code> function to retrieve handlers for particular expressions.</p>
-<div class="sourceCode"><pre class="sourceCode cpp"><code class="sourceCode cpp">Result* get_handler( SEXP call, <span class="dt">const</span> LazySubsets& subsets ){
+<div class="sourceCode"><pre class="sourceCode cpp"><code class="sourceCode cpp">Result* get_handler( SEXP call, <span class="dt">const</span> ILazySubsets& subsets ){
<span class="dt">int</span> depth = Rf_length(call) ;
HybridHandlerMap& handlers = get_handlers() ;
SEXP fun_symbol = CAR(call) ;
@@ -182,18 +182,18 @@ code > span.in { color: #60a0b0; font-weight: bold; font-style: italic; } /* Inf
<span class="kw">return</span> it->second( call, subsets, depth - <span class="dv">1</span> );
}</code></pre></div>
<p>The <code>get_handler</code> performs a lookup in a hash table of type <code>HybridHandlerMap</code>.</p>
-<div class="sourceCode"><pre class="sourceCode cpp"><code class="sourceCode cpp"><span class="kw">typedef</span> dplyr::Result* (*HybridHandler)(SEXP, <span class="dt">const</span> dplyr::LazySubsets&, <span class="dt">int</span>) ;
+<div class="sourceCode"><pre class="sourceCode cpp"><code class="sourceCode cpp"><span class="kw">typedef</span> dplyr::Result* (*HybridHandler)(SEXP, <span class="dt">const</span> dplyr::ILazySubsets&, <span class="dt">int</span>) ;
<span class="kw">typedef</span> dplyr_hash_map<SEXP,HybridHandler> HybridHandlerMap ;</code></pre></div>
<p><code>HybridHandlerMap</code> is simply a hash map where the map key is the symbol of the function and the map value type is a function pointer defined by <code>HybridHandler</code>.</p>
<p>The parameters of the <code>HybridHandler</code> function pointer type are:</p>
<ul>
<li>The call we want to hybridify, e.g. something like <code>sum(flights)</code></li>
-<li>a <code>LazySubsets</code> reference. The only thing that is relevant about this class here is that it defines a <code>get_variable</code> method that takes a symbol <code>SEXP</code> and returns the corresponding variable from the data frame.</li>
+<li>a <code>ILazySubsets</code> reference. The only thing that is relevant about this class here is that it defines a <code>get_variable</code> method that takes a symbol <code>SEXP</code> and returns the corresponding variable from the data frame.</li>
<li>The number of arguments of the call. For example for <code>sum(flights)</code>, the number of arguments is <code>1</code>.</li>
</ul>
<p>The purpose of the hybrid handler function is to return a <code>Result*</code> if it can handle the call or 0 if it cannot.</p>
<p>with our previous <code>Sum</code> template class, we could define a hybrid handler function like this:</p>
-<div class="sourceCode"><pre class="sourceCode cpp"><code class="sourceCode cpp">Result* sum_handler(SEXP call, <span class="dt">const</span> LazySubsets& subsets, <span class="dt">int</span> nargs ){
+<div class="sourceCode"><pre class="sourceCode cpp"><code class="sourceCode cpp">Result* sum_handler(SEXP call, <span class="dt">const</span> ILazySubsets& subsets, <span class="dt">int</span> nargs ){
 <span class="co">// we only know how to deal with one argument</span>
<span class="kw">if</span>( nargs != <span class="dv">1</span> ) <span class="kw">return</span> <span class="dv">0</span> ;
@@ -243,7 +243,7 @@ code > span.in { color: #60a0b0; font-weight: bold; font-style: italic; } /* Inf
<span class="co">// we actually don't need the arguments</span>
<span class="co">// we can just let this handler return a new Hitchhiker pointer</span>
-Result* hitchhiker_handler( SEXP, <span class="dt">const</span> LazySubsets&, <span class="dt">int</span> ){
+Result* hitchhiker_handler( SEXP, <span class="dt">const</span> ILazySubsets&, <span class="dt">int</span> ){
<span class="kw">return</span> <span class="kw">new</span> Hitchhiker ;
}
diff --git a/inst/doc/hybrid-evaluation.Rmd b/vignettes/internals/hybrid-evaluation.Rmd
similarity index 96%
rename from inst/doc/hybrid-evaluation.Rmd
rename to vignettes/internals/hybrid-evaluation.Rmd
index 8135895..cf9f8c1 100644
--- a/inst/doc/hybrid-evaluation.Rmd
+++ b/vignettes/internals/hybrid-evaluation.Rmd
@@ -1,6 +1,5 @@
---
title: "Hybrid evaluation"
-date: "`r Sys.Date()`"
output: rmarkdown::html_vignette
vignette: >
%\VignetteIndexEntry{Hybrid evaluation}
@@ -170,7 +169,7 @@ internal implementation in `dplyr`.
particular expressions.
```cpp
-Result* get_handler( SEXP call, const LazySubsets& subsets ){
+Result* get_handler( SEXP call, const ILazySubsets& subsets ){
int depth = Rf_length(call) ;
HybridHandlerMap& handlers = get_handlers() ;
SEXP fun_symbol = CAR(call) ;
@@ -186,7 +185,7 @@ Result* get_handler( SEXP call, const LazySubsets& subsets ){
The `get_handler` performs a lookup in a hash table of type `HybridHandlerMap`.
```cpp
-typedef dplyr::Result* (*HybridHandler)(SEXP, const dplyr::LazySubsets&, int) ;
+typedef dplyr::Result* (*HybridHandler)(SEXP, const dplyr::ILazySubsets&, int) ;
typedef dplyr_hash_map<SEXP,HybridHandler> HybridHandlerMap ;
```
@@ -196,7 +195,7 @@ function and the map value type is a function pointer defined by `HybridHandler`
The parameters of the `HybridHandler` function pointer type are:
- The call we want to hybridify, e.g. something like `sum(flights)`
- - a `LazySubsets` reference. The only thing that is relevant about
+ - a `ILazySubsets` reference. The only thing that is relevant about
this class here is that it defines a `get_variable` method that takes a
symbol `SEXP` and returns the corresponding variable from the data frame.
- The number of arguments of the call. For example for `sum(flights)`, the number
@@ -209,7 +208,7 @@ with our previous `Sum` template class, we could define a hybrid handler functio
like this:
```cpp
-Result* sum_handler(SEXP call, const LazySubsets& subsets, int nargs ){
+Result* sum_handler(SEXP call, const ILazySubsets& subsets, int nargs ){
 // we only know how to deal with one argument
if( nargs != 1 ) return 0 ;
@@ -272,7 +271,7 @@ public:
// we actually don't need the arguments
// we can just let this handler return a new Hitchhiker pointer
-Result* hitchhiker_handler( SEXP, const LazySubsets&, int ){
+Result* hitchhiker_handler( SEXP, const ILazySubsets&, int ){
return new Hitchhiker ;
}
diff --git a/vignettes/introduction.html b/vignettes/introduction.html
new file mode 100644
index 0000000..fca998b
--- /dev/null
+++ b/vignettes/introduction.html
@@ -0,0 +1,567 @@
+<!DOCTYPE html>
+
+<html xmlns="http://www.w3.org/1999/xhtml">
+
+<head>
+
+<meta charset="utf-8">
+<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
+<meta name="generator" content="pandoc" />
+
+<meta name="viewport" content="width=device-width, initial-scale=1">
+
+
+<meta name="date" content="2016-10-27" />
+
+<title>Introduction to dplyr</title>
+
+
+
+<style type="text/css">code{white-space: pre;}</style>
+<style type="text/css">
+div.sourceCode { overflow-x: auto; }
+table.sourceCode, tr.sourceCode, td.lineNumbers, td.sourceCode {
+ margin: 0; padding: 0; vertical-align: baseline; border: none; }
+table.sourceCode { width: 100%; line-height: 100%; }
+td.lineNumbers { text-align: right; padding-right: 4px; padding-left: 4px; color: #aaaaaa; border-right: 1px solid #aaaaaa; }
+td.sourceCode { padding-left: 5px; }
+code > span.kw { color: #007020; font-weight: bold; } /* Keyword */
+code > span.dt { color: #902000; } /* DataType */
+code > span.dv { color: #40a070; } /* DecVal */
+code > span.bn { color: #40a070; } /* BaseN */
+code > span.fl { color: #40a070; } /* Float */
+code > span.ch { color: #4070a0; } /* Char */
+code > span.st { color: #4070a0; } /* String */
+code > span.co { color: #60a0b0; font-style: italic; } /* Comment */
+code > span.ot { color: #007020; } /* Other */
+code > span.al { color: #ff0000; font-weight: bold; } /* Alert */
+code > span.fu { color: #06287e; } /* Function */
+code > span.er { color: #ff0000; font-weight: bold; } /* Error */
+code > span.wa { color: #60a0b0; font-weight: bold; font-style: italic; } /* Warning */
+code > span.cn { color: #880000; } /* Constant */
+code > span.sc { color: #4070a0; } /* SpecialChar */
+code > span.vs { color: #4070a0; } /* VerbatimString */
+code > span.ss { color: #bb6688; } /* SpecialString */
+code > span.im { } /* Import */
+code > span.va { color: #19177c; } /* Variable */
+code > span.cf { color: #007020; font-weight: bold; } /* ControlFlow */
+code > span.op { color: #666666; } /* Operator */
+code > span.bu { } /* BuiltIn */
+code > span.ex { } /* Extension */
+code > span.pp { color: #bc7a00; } /* Preprocessor */
+code > span.at { color: #7d9029; } /* Attribute */
+code > span.do { color: #ba2121; font-style: italic; } /* Documentation */
+code > span.an { color: #60a0b0; font-weight: bold; font-style: italic; } /* Annotation */
+code > span.cv { color: #60a0b0; font-weight: bold; font-style: italic; } /* CommentVar */
+code > span.in { color: #60a0b0; font-weight: bold; font-style: italic; } /* Information */
+</style>
+
+
+
+<link href="data:text/css;charset=utf-8,body%20%7B%0Abackground%2Dcolor%3A%20%23fff%3B%0Amargin%3A%201em%20auto%3B%0Amax%2Dwidth%3A%20700px%3B%0Aoverflow%3A%20visible%3B%0Apadding%2Dleft%3A%202em%3B%0Apadding%2Dright%3A%202em%3B%0Afont%2Dfamily%3A%20%22Open%20Sans%22%2C%20%22Helvetica%20Neue%22%2C%20Helvetica%2C%20Arial%2C%20sans%2Dserif%3B%0Afont%2Dsize%3A%2014px%3B%0Aline%2Dheight%3A%201%2E35%3B%0A%7D%0A%23header%20%7B%0Atext%2Dalign%3A%20center%3B%0A%7D%0A%23TOC%20%7B%0Aclear%3A%20bot [...]
+
+</head>
+
+<body>
+
+
+
+
+<h1 class="title toc-ignore">Introduction to dplyr</h1>
+<h4 class="date"><em>2016-10-27</em></h4>
+
+
+
+<p>When working with data you must:</p>
+<ul>
+<li><p>Figure out what you want to do.</p></li>
+<li><p>Describe those tasks in the form of a computer program.</p></li>
+<li><p>Execute the program.</p></li>
+</ul>
+<p>The dplyr package makes these steps fast and easy:</p>
+<ul>
+<li><p>By constraining your options, it simplifies how you can think about common data manipulation tasks.</p></li>
+<li><p>It provides simple “verbs”, functions that correspond to the most common data manipulation tasks, to help you translate those thoughts into code.</p></li>
+<li><p>It uses efficient data storage backends, so you spend less time waiting for the computer.</p></li>
+</ul>
+<p>This document introduces you to dplyr’s basic set of tools, and shows you how to apply them to data frames. Other vignettes provide more details on specific topics:</p>
+<ul>
+<li><p>databases: Besides in-memory data frames, dplyr also connects to out-of-memory, remote databases. And by translating your R code into the appropriate SQL, it allows you to work with both types of data using the same set of tools.</p></li>
+<li><p>benchmark-baseball: see how dplyr compares to other tools for data manipulation on a realistic use case.</p></li>
+<li><p>window-functions: a window function is a variation on an aggregation function. Where an aggregate function uses <code>n</code> inputs to produce 1 output, a window function uses <code>n</code> inputs to produce <code>n</code> outputs.</p></li>
+</ul>
+<div id="data-nycflights13" class="section level2">
+<h2>Data: nycflights13</h2>
+<p>To explore the basic data manipulation verbs of dplyr, we’ll start with the built in <code>nycflights13</code> data frame. This dataset contains all 336776 flights that departed from New York City in 2013. The data comes from the US <a href="http://www.transtats.bts.gov/DatabaseInfo.asp?DB_ID=120&Link=0">Bureau of Transportation Statistics</a>, and is documented in <code>?nycflights13</code></p>
+<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"><span class="kw">library</span>(nycflights13)
+<span class="kw">dim</span>(flights)
+<span class="co">#> [1] 336776 19</span>
+<span class="kw">head</span>(flights)
+<span class="co">#> # A tibble: 6 × 19</span>
+<span class="co">#> year month day dep_time sched_dep_time dep_delay arr_time</span>
+<span class="co">#> <int> <int> <int> <int> <int> <dbl> <int></span>
+<span class="co">#> 1 2013 1 1 517 515 2 830</span>
+<span class="co">#> 2 2013 1 1 533 529 4 850</span>
+<span class="co">#> 3 2013 1 1 542 540 2 923</span>
+<span class="co">#> 4 2013 1 1 544 545 -1 1004</span>
+<span class="co">#> # ... with 2 more rows, and 12 more variables: sched_arr_time <int>,</span>
+<span class="co">#> # arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,</span>
+<span class="co">#> # origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,</span>
+<span class="co">#> # minute <dbl>, time_hour <dttm></span></code></pre></div>
+<p>dplyr can work with data frames as is, but if you’re dealing with large data, it’s worthwhile to convert them to a <code>tbl_df</code>: this is a wrapper around a data frame that won’t accidentally print a lot of data to the screen.</p>
+</div>
+<div id="single-table-verbs" class="section level2">
+<h2>Single table verbs</h2>
+<p>Dplyr aims to provide a function for each basic verb of data manipulation:</p>
+<ul>
+<li><code>filter()</code> (and <code>slice()</code>)</li>
+<li><code>arrange()</code></li>
+<li><code>select()</code> (and <code>rename()</code>)</li>
+<li><code>distinct()</code></li>
+<li><code>mutate()</code> (and <code>transmute()</code>)</li>
+<li><code>summarise()</code></li>
+<li><code>sample_n()</code> (and <code>sample_frac()</code>)</li>
+</ul>
+<p>If you’ve used plyr before, many of these will be familar.</p>
+</div>
+<div id="filter-rows-with-filter" class="section level2">
+<h2>Filter rows with <code>filter()</code></h2>
+<p><code>filter()</code> allows you to select a subset of rows in a data frame. The first argument is the name of the data frame. The second and subsequent arguments are the expressions that filter the data frame:</p>
+<p>For example, we can select all flights on January 1st with:</p>
+<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"><span class="kw">filter</span>(flights, month ==<span class="st"> </span><span class="dv">1</span>, day ==<span class="st"> </span><span class="dv">1</span>)
+<span class="co">#> # A tibble: 842 × 19</span>
+<span class="co">#> year month day dep_time sched_dep_time dep_delay arr_time</span>
+<span class="co">#> <int> <int> <int> <int> <int> <dbl> <int></span>
+<span class="co">#> 1 2013 1 1 517 515 2 830</span>
+<span class="co">#> 2 2013 1 1 533 529 4 850</span>
+<span class="co">#> 3 2013 1 1 542 540 2 923</span>
+<span class="co">#> 4 2013 1 1 544 545 -1 1004</span>
+<span class="co">#> # ... with 838 more rows, and 12 more variables: sched_arr_time <int>,</span>
+<span class="co">#> # arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,</span>
+<span class="co">#> # origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,</span>
+<span class="co">#> # minute <dbl>, time_hour <dttm></span></code></pre></div>
+<p>This is equivalent to the more verbose code in base R:</p>
+<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r">flights[flights$month ==<span class="st"> </span><span class="dv">1</span> &<span class="st"> </span>flights$day ==<span class="st"> </span><span class="dv">1</span>, ]</code></pre></div>
+<p><code>filter()</code> works similarly to <code>subset()</code> except that you can give it any number of filtering conditions, which are joined together with <code>&</code> (not <code>&&</code> which is easy to do accidentally!). You can also use other boolean operators:</p>
+<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"><span class="kw">filter</span>(flights, month ==<span class="st"> </span><span class="dv">1</span> |<span class="st"> </span>month ==<span class="st"> </span><span class="dv">2</span>)</code></pre></div>
+<p>To select rows by position, use <code>slice()</code>:</p>
+<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"><span class="kw">slice</span>(flights, <span class="dv">1</span>:<span class="dv">10</span>)
+<span class="co">#> # A tibble: 10 × 19</span>
+<span class="co">#> year month day dep_time sched_dep_time dep_delay arr_time</span>
+<span class="co">#> <int> <int> <int> <int> <int> <dbl> <int></span>
+<span class="co">#> 1 2013 1 1 517 515 2 830</span>
+<span class="co">#> 2 2013 1 1 533 529 4 850</span>
+<span class="co">#> 3 2013 1 1 542 540 2 923</span>
+<span class="co">#> 4 2013 1 1 544 545 -1 1004</span>
+<span class="co">#> # ... with 6 more rows, and 12 more variables: sched_arr_time <int>,</span>
+<span class="co">#> # arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,</span>
+<span class="co">#> # origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,</span>
+<span class="co">#> # minute <dbl>, time_hour <dttm></span></code></pre></div>
+</div>
+<div id="arrange-rows-with-arrange" class="section level2">
+<h2>Arrange rows with <code>arrange()</code></h2>
+<p><code>arrange()</code> works similarly to <code>filter()</code> except that instead of filtering or selecting rows, it reorders them. It takes a data frame, and a set of column names (or more complicated expressions) to order by. If you provide more than one column name, each additional column will be used to break ties in the values of preceding columns:</p>
+<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"><span class="kw">arrange</span>(flights, year, month, day)
+<span class="co">#> # A tibble: 336,776 × 19</span>
+<span class="co">#> year month day dep_time sched_dep_time dep_delay arr_time</span>
+<span class="co">#> <int> <int> <int> <int> <int> <dbl> <int></span>
+<span class="co">#> 1 2013 1 1 517 515 2 830</span>
+<span class="co">#> 2 2013 1 1 533 529 4 850</span>
+<span class="co">#> 3 2013 1 1 542 540 2 923</span>
+<span class="co">#> 4 2013 1 1 544 545 -1 1004</span>
+<span class="co">#> # ... with 336,772 more rows, and 12 more variables: sched_arr_time <int>,</span>
+<span class="co">#> # arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,</span>
+<span class="co">#> # origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,</span>
+<span class="co">#> # minute <dbl>, time_hour <dttm></span></code></pre></div>
+<p>Use <code>desc()</code> to order a column in descending order:</p>
+<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"><span class="kw">arrange</span>(flights, <span class="kw">desc</span>(arr_delay))
+<span class="co">#> # A tibble: 336,776 × 19</span>
+<span class="co">#> year month day dep_time sched_dep_time dep_delay arr_time</span>
+<span class="co">#> <int> <int> <int> <int> <int> <dbl> <int></span>
+<span class="co">#> 1 2013 1 9 641 900 1301 1242</span>
+<span class="co">#> 2 2013 6 15 1432 1935 1137 1607</span>
+<span class="co">#> 3 2013 1 10 1121 1635 1126 1239</span>
+<span class="co">#> 4 2013 9 20 1139 1845 1014 1457</span>
+<span class="co">#> # ... with 336,772 more rows, and 12 more variables: sched_arr_time <int>,</span>
+<span class="co">#> # arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,</span>
+<span class="co">#> # origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,</span>
+<span class="co">#> # minute <dbl>, time_hour <dttm></span></code></pre></div>
+<p><code>dplyr::arrange()</code> works the same way as <code>plyr::arrange()</code>. It’s a straightforward wrapper around <code>order()</code> that requires less typing. The previous code is equivalent to:</p>
+<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r">flights[<span class="kw">order</span>(flights$year, flights$month, flights$day), ]
+flights[<span class="kw">order</span>(flights$arr_delay, <span class="dt">decreasing =</span> <span class="ot">TRUE</span>), ] or flights[<span class="kw">order</span>(-flights$arr_delay), ]</code></pre></div>
+</div>
+<div id="select-columns-with-select" class="section level2">
+<h2>Select columns with <code>select()</code></h2>
+<p>Often you work with large datasets with many columns but only a few are actually of interest to you. <code>select()</code> allows you to rapidly zoom in on a useful subset using operations that usually only work on numeric variable positions:</p>
+<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"><span class="co"># Select columns by name</span>
+<span class="kw">select</span>(flights, year, month, day)
+<span class="co">#> # A tibble: 336,776 × 3</span>
+<span class="co">#> year month day</span>
+<span class="co">#> <int> <int> <int></span>
+<span class="co">#> 1 2013 1 1</span>
+<span class="co">#> 2 2013 1 1</span>
+<span class="co">#> 3 2013 1 1</span>
+<span class="co">#> 4 2013 1 1</span>
+<span class="co">#> # ... with 336,772 more rows</span>
+<span class="co"># Select all columns between year and day (inclusive)</span>
+<span class="kw">select</span>(flights, year:day)
+<span class="co">#> # A tibble: 336,776 × 3</span>
+<span class="co">#> year month day</span>
+<span class="co">#> <int> <int> <int></span>
+<span class="co">#> 1 2013 1 1</span>
+<span class="co">#> 2 2013 1 1</span>
+<span class="co">#> 3 2013 1 1</span>
+<span class="co">#> 4 2013 1 1</span>
+<span class="co">#> # ... with 336,772 more rows</span>
+<span class="co"># Select all columns except those from year to day (inclusive)</span>
+<span class="kw">select</span>(flights, -(year:day))
+<span class="co">#> # A tibble: 336,776 × 16</span>
+<span class="co">#> dep_time sched_dep_time dep_delay arr_time sched_arr_time arr_delay</span>
+<span class="co">#> <int> <int> <dbl> <int> <int> <dbl></span>
+<span class="co">#> 1 517 515 2 830 819 11</span>
+<span class="co">#> 2 533 529 4 850 830 20</span>
+<span class="co">#> 3 542 540 2 923 850 33</span>
+<span class="co">#> 4 544 545 -1 1004 1022 -18</span>
+<span class="co">#> # ... with 336,772 more rows, and 10 more variables: carrier <chr>,</span>
+<span class="co">#> # flight <int>, tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>,</span>
+<span class="co">#> # distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm></span></code></pre></div>
+<p>This function works similarly to the <code>select</code> argument in <code>base::subset()</code>. Because the dplyr philosophy is to have small functions that do one thing well, it’s its own function in dplyr.</p>
+<p>There are a number of helper functions you can use within <code>select()</code>, like <code>starts_with()</code>, <code>ends_with()</code>, <code>matches()</code> and <code>contains()</code>. These let you quickly match larger blocks of variables that meet some criterion. See <code>?select</code> for more details.</p>
+<p>You can rename variables with <code>select()</code> by using named arguments:</p>
+<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"><span class="kw">select</span>(flights, <span class="dt">tail_num =</span> tailnum)
+<span class="co">#> # A tibble: 336,776 × 1</span>
+<span class="co">#> tail_num</span>
+<span class="co">#> <chr></span>
+<span class="co">#> 1 N14228</span>
+<span class="co">#> 2 N24211</span>
+<span class="co">#> 3 N619AA</span>
+<span class="co">#> 4 N804JB</span>
+<span class="co">#> # ... with 336,772 more rows</span></code></pre></div>
+<p>But because <code>select()</code> drops all the variables not explicitly mentioned, it’s not that useful. Instead, use <code>rename()</code>:</p>
+<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"><span class="kw">rename</span>(flights, <span class="dt">tail_num =</span> tailnum)
+<span class="co">#> # A tibble: 336,776 × 19</span>
+<span class="co">#> year month day dep_time sched_dep_time dep_delay arr_time</span>
+<span class="co">#> <int> <int> <int> <int> <int> <dbl> <int></span>
+<span class="co">#> 1 2013 1 1 517 515 2 830</span>
+<span class="co">#> 2 2013 1 1 533 529 4 850</span>
+<span class="co">#> 3 2013 1 1 542 540 2 923</span>
+<span class="co">#> 4 2013 1 1 544 545 -1 1004</span>
+<span class="co">#> # ... with 336,772 more rows, and 12 more variables: sched_arr_time <int>,</span>
+<span class="co">#> # arr_delay <dbl>, carrier <chr>, flight <int>, tail_num <chr>,</span>
+<span class="co">#> # origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,</span>
+<span class="co">#> # minute <dbl>, time_hour <dttm></span></code></pre></div>
+</div>
+<div id="extract-distinct-unique-rows" class="section level2">
+<h2>Extract distinct (unique) rows</h2>
+<p>Use <code>distinct()</code>to find unique values in a table:</p>
+<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"><span class="kw">distinct</span>(flights, tailnum)
+<span class="co">#> # A tibble: 4,044 × 1</span>
+<span class="co">#> tailnum</span>
+<span class="co">#> <chr></span>
+<span class="co">#> 1 N14228</span>
+<span class="co">#> 2 N24211</span>
+<span class="co">#> 3 N619AA</span>
+<span class="co">#> 4 N804JB</span>
+<span class="co">#> # ... with 4,040 more rows</span>
+<span class="kw">distinct</span>(flights, origin, dest)
+<span class="co">#> # A tibble: 224 × 2</span>
+<span class="co">#> origin dest</span>
+<span class="co">#> <chr> <chr></span>
+<span class="co">#> 1 EWR IAH</span>
+<span class="co">#> 2 LGA IAH</span>
+<span class="co">#> 3 JFK MIA</span>
+<span class="co">#> 4 JFK BQN</span>
+<span class="co">#> # ... with 220 more rows</span></code></pre></div>
+<p>(This is very similar to <code>base::unique()</code> but should be much faster.)</p>
+</div>
+<div id="add-new-columns-with-mutate" class="section level2">
+<h2>Add new columns with <code>mutate()</code></h2>
+<p>Besides selecting sets of existing columns, it’s often useful to add new columns that are functions of existing columns. This is the job of <code>mutate()</code>:</p>
+<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"><span class="kw">mutate</span>(flights,
+ <span class="dt">gain =</span> arr_delay -<span class="st"> </span>dep_delay,
+ <span class="dt">speed =</span> distance /<span class="st"> </span>air_time *<span class="st"> </span><span class="dv">60</span>)
+<span class="co">#> # A tibble: 336,776 × 21</span>
+<span class="co">#> year month day dep_time sched_dep_time dep_delay arr_time</span>
+<span class="co">#> <int> <int> <int> <int> <int> <dbl> <int></span>
+<span class="co">#> 1 2013 1 1 517 515 2 830</span>
+<span class="co">#> 2 2013 1 1 533 529 4 850</span>
+<span class="co">#> 3 2013 1 1 542 540 2 923</span>
+<span class="co">#> 4 2013 1 1 544 545 -1 1004</span>
+<span class="co">#> # ... with 336,772 more rows, and 14 more variables: sched_arr_time <int>,</span>
+<span class="co">#> # arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,</span>
+<span class="co">#> # origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,</span>
+<span class="co">#> # minute <dbl>, time_hour <dttm>, gain <dbl>, speed <dbl></span></code></pre></div>
+<p><code>dplyr::mutate()</code> works the same way as <code>plyr::mutate()</code> and similarly to <code>base::transform()</code>. The key difference between <code>mutate()</code> and <code>transform()</code> is that mutate allows you to refer to columns that you’ve just created:</p>
+<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"><span class="kw">mutate</span>(flights,
+ <span class="dt">gain =</span> arr_delay -<span class="st"> </span>dep_delay,
+ <span class="dt">gain_per_hour =</span> gain /<span class="st"> </span>(air_time /<span class="st"> </span><span class="dv">60</span>)
+)
+<span class="co">#> # A tibble: 336,776 × 21</span>
+<span class="co">#> year month day dep_time sched_dep_time dep_delay arr_time</span>
+<span class="co">#> <int> <int> <int> <int> <int> <dbl> <int></span>
+<span class="co">#> 1 2013 1 1 517 515 2 830</span>
+<span class="co">#> 2 2013 1 1 533 529 4 850</span>
+<span class="co">#> 3 2013 1 1 542 540 2 923</span>
+<span class="co">#> 4 2013 1 1 544 545 -1 1004</span>
+<span class="co">#> # ... with 336,772 more rows, and 14 more variables: sched_arr_time <int>,</span>
+<span class="co">#> # arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,</span>
+<span class="co">#> # origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,</span>
+<span class="co">#> # minute <dbl>, time_hour <dttm>, gain <dbl>, gain_per_hour <dbl></span></code></pre></div>
+<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"><span class="kw">transform</span>(flights,
+ <span class="dt">gain =</span> arr_delay -<span class="st"> </span>delay,
+ <span class="dt">gain_per_hour =</span> gain /<span class="st"> </span>(air_time /<span class="st"> </span><span class="dv">60</span>)
+)
+<span class="co">#> Error: object 'gain' not found</span></code></pre></div>
+<p>If you only want to keep the new variables, use <code>transmute()</code>:</p>
+<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"><span class="kw">transmute</span>(flights,
+ <span class="dt">gain =</span> arr_delay -<span class="st"> </span>dep_delay,
+ <span class="dt">gain_per_hour =</span> gain /<span class="st"> </span>(air_time /<span class="st"> </span><span class="dv">60</span>)
+)
+<span class="co">#> # A tibble: 336,776 × 2</span>
+<span class="co">#> gain gain_per_hour</span>
+<span class="co">#> <dbl> <dbl></span>
+<span class="co">#> 1 9 2.378855</span>
+<span class="co">#> 2 16 4.229075</span>
+<span class="co">#> 3 31 11.625000</span>
+<span class="co">#> 4 -17 -5.573770</span>
+<span class="co">#> # ... with 336,772 more rows</span></code></pre></div>
+</div>
+<div id="summarise-values-with-summarise" class="section level2">
+<h2>Summarise values with <code>summarise()</code></h2>
+<p>The last verb is <code>summarise()</code>. It collapses a data frame to a single row (this is exactly equivalent to <code>plyr::summarise()</code>):</p>
+<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"><span class="kw">summarise</span>(flights,
+ <span class="dt">delay =</span> <span class="kw">mean</span>(dep_delay, <span class="dt">na.rm =</span> <span class="ot">TRUE</span>))
+<span class="co">#> # A tibble: 1 × 1</span>
+<span class="co">#> delay</span>
+<span class="co">#> <dbl></span>
+<span class="co">#> 1 12.63907</span></code></pre></div>
+<p>Below, we’ll see how this verb can be very useful.</p>
+</div>
+<div id="randomly-sample-rows-with-sample_n-and-sample_frac" class="section level2">
+<h2>Randomly sample rows with <code>sample_n()</code> and <code>sample_frac()</code></h2>
+<p>You can use <code>sample_n()</code> and <code>sample_frac()</code> to take a random sample of rows: use <code>sample_n()</code> for a fixed number and <code>sample_frac()</code> for a fixed fraction.</p>
+<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"><span class="kw">sample_n</span>(flights, <span class="dv">10</span>)
+<span class="co">#> # A tibble: 10 × 19</span>
+<span class="co">#> year month day dep_time sched_dep_time dep_delay arr_time</span>
+<span class="co">#> <int> <int> <int> <int> <int> <dbl> <int></span>
+<span class="co">#> 1 2013 10 8 800 800 0 904</span>
+<span class="co">#> 2 2013 12 25 1047 1055 -8 1243</span>
+<span class="co">#> 3 2013 11 22 814 756 18 1024</span>
+<span class="co">#> 4 2013 4 5 2033 1829 124 2250</span>
+<span class="co">#> # ... with 6 more rows, and 12 more variables: sched_arr_time <int>,</span>
+<span class="co">#> # arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,</span>
+<span class="co">#> # origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,</span>
+<span class="co">#> # minute <dbl>, time_hour <dttm></span>
+<span class="kw">sample_frac</span>(flights, <span class="fl">0.01</span>)
+<span class="co">#> # A tibble: 3,368 × 19</span>
+<span class="co">#> year month day dep_time sched_dep_time dep_delay arr_time</span>
+<span class="co">#> <int> <int> <int> <int> <int> <dbl> <int></span>
+<span class="co">#> 1 2013 5 27 1823 1829 -6 2000</span>
+<span class="co">#> 2 2013 1 21 740 745 -5 916</span>
+<span class="co">#> 3 2013 9 17 641 640 1 747</span>
+<span class="co">#> 4 2013 3 13 1132 1137 -5 1321</span>
+<span class="co">#> # ... with 3,364 more rows, and 12 more variables: sched_arr_time <int>,</span>
+<span class="co">#> # arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,</span>
+<span class="co">#> # origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,</span>
+<span class="co">#> # minute <dbl>, time_hour <dttm></span></code></pre></div>
+<p>Use <code>replace = TRUE</code> to perform a bootstrap sample. If needed, you can weight the sample with the <code>weight</code> argument.</p>
+</div>
+<div id="commonalities" class="section level2">
+<h2>Commonalities</h2>
+<p>You may have noticed that the syntax and function of all these verbs are very similar:</p>
+<ul>
+<li><p>The first argument is a data frame.</p></li>
+<li><p>The subsequent arguments describe what to do with the data frame. Notice that you can refer to columns in the data frame directly without using <code>$</code>.</p></li>
+<li><p>The result is a new data frame</p></li>
+</ul>
+<p>Together these properties make it easy to chain together multiple simple steps to achieve a complex result.</p>
+<p>These five functions provide the basis of a language of data manipulation. At the most basic level, you can only alter a tidy data frame in five useful ways: you can reorder the rows (<code>arrange()</code>), pick observations and variables of interest (<code>filter()</code> and <code>select()</code>), add new variables that are functions of existing variables (<code>mutate()</code>), or collapse many values to a summary (<code>summarise()</code>). The remainder of the language comes [...]
+</div>
+<div id="grouped-operations" class="section level1">
+<h1>Grouped operations</h1>
+<p>These verbs are useful on their own, but they become really powerful when you apply them to groups of observations within a dataset. In dplyr, you do this by with the <code>group_by()</code> function. It breaks down a dataset into specified groups of rows. When you then apply the verbs above on the resulting object they’ll be automatically applied “by group”. Most importantly, all this is achieved by using the same exact syntax you’d use with an ungrouped object.</p>
+<p>Grouping affects the verbs as follows:</p>
+<ul>
+<li><p>grouped <code>select()</code> is the same as ungrouped <code>select()</code>, except that grouping variables are always retained.</p></li>
+<li><p>grouped <code>arrange()</code> orders first by the grouping variables</p></li>
+<li><p><code>mutate()</code> and <code>filter()</code> are most useful in conjunction with window functions (like <code>rank()</code>, or <code>min(x) == x</code>). They are described in detail in <code>vignette("window-functions")</code>.</p></li>
+<li><p><code>sample_n()</code> and <code>sample_frac()</code> sample the specified number/fraction of rows in each group.</p></li>
+<li><p><code>slice()</code> extracts rows within each group.</p></li>
+<li><p><code>summarise()</code> is powerful and easy to understand, as described in more detail below.</p></li>
+</ul>
+<p>In the following example, we split the complete dataset into individual planes and then summarise each plane by counting the number of flights (<code>count = n()</code>) and computing the average distance (<code>dist = mean(Distance, na.rm = TRUE)</code>) and arrival delay (<code>delay = mean(ArrDelay, na.rm = TRUE)</code>). We then use ggplot2 to display the output.</p>
+<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r">by_tailnum <-<span class="st"> </span><span class="kw">group_by</span>(flights, tailnum)
+delay <-<span class="st"> </span><span class="kw">summarise</span>(by_tailnum,
+ <span class="dt">count =</span> <span class="kw">n</span>(),
+ <span class="dt">dist =</span> <span class="kw">mean</span>(distance, <span class="dt">na.rm =</span> <span class="ot">TRUE</span>),
+ <span class="dt">delay =</span> <span class="kw">mean</span>(arr_delay, <span class="dt">na.rm =</span> <span class="ot">TRUE</span>))
+delay <-<span class="st"> </span><span class="kw">filter</span>(delay, count ><span class="st"> </span><span class="dv">20</span>, dist <<span class="st"> </span><span class="dv">2000</span>)
+
+<span class="co"># Interestingly, the average delay is only slightly related to the</span>
+<span class="co"># average distance flown by a plane.</span>
+<span class="kw">ggplot</span>(delay, <span class="kw">aes</span>(dist, delay)) +
+<span class="st"> </span><span class="kw">geom_point</span>(<span class="kw">aes</span>(<span class="dt">size =</span> count), <span class="dt">alpha =</span> <span class="dv">1</span>/<span class="dv">2</span>) +
+<span class="st"> </span><span class="kw">geom_smooth</span>() +
+<span class="st"> </span><span class="kw">scale_size_area</span>()</code></pre></div>
+<p><img src="data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAkAAAAEgCAIAAADT5FumAAAACXBIWXMAAA7DAAAOwwHHb6hkAAAgAElEQVR4nOy9d0AU5xbwvTPbe6f3LioICiiigr2boGCNJYnGGJNrSdXk5iYxV43ReE28lmiiUYJ4LbFiQwQlKIgiKE06LCy7tF22t/n+eL53X14EpC0s+vz+gmFmnjPDzJznnOcUBMMwHAQCgUAggw10oAWAQCAQCKQnQAUGgUAgkEEJVGAQCAQCGZRABQaBQCCQQQlUYBAIBAIZlEAFBoFAIJBBCWGgBeiMlpaW/hmIRCLpdLr+GasHkMlkDMOsWUIrv4EUCsVgMBgMhoEWpH1QFEVR1JrFI5PJWq3WZDINtCztQyAQTCaTNYtHJBLVanXPDmcymX0rz6uEVSswrVbbPwOB97N/xuoBFAoFwzBrltDKbyCTydTpdFYrI [...]
+<p>You use <code>summarise()</code> with <strong>aggregate functions</strong>, which take a vector of values and return a single number. There are many useful examples of such functions in base R like <code>min()</code>, <code>max()</code>, <code>mean()</code>, <code>sum()</code>, <code>sd()</code>, <code>median()</code>, and <code>IQR()</code>. dplyr provides a handful of others:</p>
+<ul>
+<li><p><code>n()</code>: the number of observations in the current group</p></li>
+<li><p><code>n_distinct(x)</code>:the number of unique values in <code>x</code>.</p></li>
+<li><p><code>first(x)</code>, <code>last(x)</code> and <code>nth(x, n)</code> - these work similarly to <code>x[1]</code>, <code>x[length(x)]</code>, and <code>x[n]</code> but give you more control over the result if the value is missing.</p></li>
+</ul>
+<p>For example, we could use these to find the number of planes and the number of flights that go to each possible destination:</p>
+<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r">destinations <-<span class="st"> </span><span class="kw">group_by</span>(flights, dest)
+<span class="kw">summarise</span>(destinations,
+ <span class="dt">planes =</span> <span class="kw">n_distinct</span>(tailnum),
+ <span class="dt">flights =</span> <span class="kw">n</span>()
+)
+<span class="co">#> # A tibble: 105 × 3</span>
+<span class="co">#> dest planes flights</span>
+<span class="co">#> <chr> <int> <int></span>
+<span class="co">#> 1 ABQ 108 254</span>
+<span class="co">#> 2 ACK 58 265</span>
+<span class="co">#> 3 ALB 172 439</span>
+<span class="co">#> 4 ANC 6 8</span>
+<span class="co">#> # ... with 101 more rows</span></code></pre></div>
+<p>You can also use any function that you write yourself. For performance, dplyr provides optimised C++ versions of many of these functions. If you want to provide your own C++ function, see the hybrid-evaluation vignette for more details.</p>
+<p>When you group by multiple variables, each summary peels off one level of the grouping. That makes it easy to progressively roll-up a dataset:</p>
+<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r">daily <-<span class="st"> </span><span class="kw">group_by</span>(flights, year, month, day)
+(per_day <-<span class="st"> </span><span class="kw">summarise</span>(daily, <span class="dt">flights =</span> <span class="kw">n</span>()))
+<span class="co">#> Source: local data frame [365 x 4]</span>
+<span class="co">#> Groups: year, month [?]</span>
+<span class="co">#> </span>
+<span class="co">#> year month day flights</span>
+<span class="co">#> <int> <int> <int> <int></span>
+<span class="co">#> 1 2013 1 1 842</span>
+<span class="co">#> 2 2013 1 2 943</span>
+<span class="co">#> 3 2013 1 3 914</span>
+<span class="co">#> 4 2013 1 4 915</span>
+<span class="co">#> # ... with 361 more rows</span>
+(per_month <-<span class="st"> </span><span class="kw">summarise</span>(per_day, <span class="dt">flights =</span> <span class="kw">sum</span>(flights)))
+<span class="co">#> Source: local data frame [12 x 3]</span>
+<span class="co">#> Groups: year [?]</span>
+<span class="co">#> </span>
+<span class="co">#> year month flights</span>
+<span class="co">#> <int> <int> <int></span>
+<span class="co">#> 1 2013 1 27004</span>
+<span class="co">#> 2 2013 2 24951</span>
+<span class="co">#> 3 2013 3 28834</span>
+<span class="co">#> 4 2013 4 28330</span>
+<span class="co">#> # ... with 8 more rows</span>
+(per_year <-<span class="st"> </span><span class="kw">summarise</span>(per_month, <span class="dt">flights =</span> <span class="kw">sum</span>(flights)))
+<span class="co">#> # A tibble: 1 × 2</span>
+<span class="co">#> year flights</span>
+<span class="co">#> <int> <int></span>
+<span class="co">#> 1 2013 336776</span></code></pre></div>
+<p>However you need to be careful when progressively rolling up summaries like this: it’s ok for sums and counts, but you need to think about weighting for means and variances (it’s not possible to do this exactly for medians).</p>
+<div id="chaining" class="section level2">
+<h2>Chaining</h2>
+<p>The dplyr API is functional in the sense that function calls don’t have side-effects. You must always save their results. This doesn’t lead to particularly elegant code, especially if you want to do many operations at once. You either have to do it step-by-step:</p>
+<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r">a1 <-<span class="st"> </span><span class="kw">group_by</span>(flights, year, month, day)
+a2 <-<span class="st"> </span><span class="kw">select</span>(a1, arr_delay, dep_delay)
+a3 <-<span class="st"> </span><span class="kw">summarise</span>(a2,
+ <span class="dt">arr =</span> <span class="kw">mean</span>(arr_delay, <span class="dt">na.rm =</span> <span class="ot">TRUE</span>),
+ <span class="dt">dep =</span> <span class="kw">mean</span>(dep_delay, <span class="dt">na.rm =</span> <span class="ot">TRUE</span>))
+a4 <-<span class="st"> </span><span class="kw">filter</span>(a3, arr ><span class="st"> </span><span class="dv">30</span> |<span class="st"> </span>dep ><span class="st"> </span><span class="dv">30</span>)</code></pre></div>
+<p>Or if you don’t want to save the intermediate results, you need to wrap the function calls inside each other:</p>
+<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"><span class="kw">filter</span>(
+ <span class="kw">summarise</span>(
+ <span class="kw">select</span>(
+ <span class="kw">group_by</span>(flights, year, month, day),
+ arr_delay, dep_delay
+ ),
+ <span class="dt">arr =</span> <span class="kw">mean</span>(arr_delay, <span class="dt">na.rm =</span> <span class="ot">TRUE</span>),
+ <span class="dt">dep =</span> <span class="kw">mean</span>(dep_delay, <span class="dt">na.rm =</span> <span class="ot">TRUE</span>)
+ ),
+ arr ><span class="st"> </span><span class="dv">30</span> |<span class="st"> </span>dep ><span class="st"> </span><span class="dv">30</span>
+)
+<span class="co">#> Adding missing grouping variables: `year`, `month`, `day`</span>
+<span class="co">#> Source: local data frame [49 x 5]</span>
+<span class="co">#> Groups: year, month [11]</span>
+<span class="co">#> </span>
+<span class="co">#> year month day arr dep</span>
+<span class="co">#> <int> <int> <int> <dbl> <dbl></span>
+<span class="co">#> 1 2013 1 16 34.24736 24.61287</span>
+<span class="co">#> 2 2013 1 31 32.60285 28.65836</span>
+<span class="co">#> 3 2013 2 11 36.29009 39.07360</span>
+<span class="co">#> 4 2013 2 27 31.25249 37.76327</span>
+<span class="co">#> # ... with 45 more rows</span></code></pre></div>
+<p>This is difficult to read because the order of the operations is from inside to out. Thus, the arguments are a long way away from the function. To get around this problem, dplyr provides the <code>%>%</code> operator. <code>x %>% f(y)</code> turns into <code>f(x, y)</code> so you can use it to rewrite multiple operations that you can read left-to-right, top-to-bottom:</p>
+<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r">flights %>%
+<span class="st"> </span><span class="kw">group_by</span>(year, month, day) %>%
+<span class="st"> </span><span class="kw">select</span>(arr_delay, dep_delay) %>%
+<span class="st"> </span><span class="kw">summarise</span>(
+ <span class="dt">arr =</span> <span class="kw">mean</span>(arr_delay, <span class="dt">na.rm =</span> <span class="ot">TRUE</span>),
+ <span class="dt">dep =</span> <span class="kw">mean</span>(dep_delay, <span class="dt">na.rm =</span> <span class="ot">TRUE</span>)
+ ) %>%
+<span class="st"> </span><span class="kw">filter</span>(arr ><span class="st"> </span><span class="dv">30</span> |<span class="st"> </span>dep ><span class="st"> </span><span class="dv">30</span>)</code></pre></div>
+</div>
+</div>
+<div id="other-data-sources" class="section level1">
+<h1>Other data sources</h1>
+<p>As well as data frames, dplyr works with data that is stored in other ways, like data tables, databases and multidimensional arrays.</p>
+<div id="data-table" class="section level2">
+<h2>Data table</h2>
+<p>dplyr also provides <a href="http://datatable.r-forge.r-project.org/">data table</a> methods for all verbs through <a href="http://github.com/hadley/dtplyr">dtplyr</a>. If you’re using data.tables already this lets you to use dplyr syntax for data manipulation, and data.table for everything else.</p>
+<p>For multiple operations, data.table can be faster because you usually use it with multiple verbs simultaneously. For example, with data table you can do a mutate and a select in a single step. It’s smart enough to know that there’s no point in computing the new variable for rows you’re about to throw away.</p>
+<p>The advantages of using dplyr with data tables are:</p>
+<ul>
+<li><p>For common data manipulation tasks, it insulates you from the reference semantics of data.tables, and protects you from accidentally modifying your data.</p></li>
+<li><p>Instead of one complex method built on the subscripting operator (<code>[</code>), it provides many simple methods.</p></li>
+</ul>
+</div>
+<div id="databases" class="section level2">
+<h2>Databases</h2>
+<p>dplyr also allows you to use the same verbs with a remote database. It takes care of generating the SQL for you so that you can avoid the cognitive challenge of constantly switching between languages. See the databases vignette for more details.</p>
+<p>Compared to DBI and the database connection algorithms:</p>
+<ul>
+<li>it hides, as much as possible, the fact that you’re working with a remote database</li>
+<li>you don’t need to know any SQL (although it helps!)</li>
+<li>it abstracts over the many differences between the different DBI implementations</li>
+</ul>
+</div>
+<div id="multidimensional-arrays-cubes" class="section level2">
+<h2>Multidimensional arrays / cubes</h2>
+<p><code>tbl_cube()</code> provides an experimental interface to multidimensional arrays or data cubes. If you’re using this form of data in R, please get in touch so I can better understand your needs.</p>
+</div>
+</div>
+<div id="comparisons" class="section level1">
+<h1>Comparisons</h1>
+<p>Compared to all existing options, dplyr:</p>
+<ul>
+<li><p>abstracts away how your data is stored, so that you can work with data frames, data tables and remote databases using the same set of functions. This lets you focus on what you want to achieve, not on the logistics of data storage.</p></li>
+<li><p>provides a thoughtful default <code>print()</code> method that doesn’t automatically print pages of data to the screen (this was inspired by data table’s output).</p></li>
+</ul>
+<p>Compared to base functions:</p>
+<ul>
+<li><p>dplyr is much more consistent; functions have the same interface. So once you’ve mastered one, you can easily pick up the others</p></li>
+<li><p>base functions tend to be based around vectors; dplyr is based around data frames</p></li>
+</ul>
+<p>Compared to plyr, dplyr:</p>
+<ul>
+<li><p>is much much faster</p></li>
+<li><p>provides a better thought out set of joins</p></li>
+<li><p>only provides tools for working with data frames (e.g. most of dplyr is equivalent to <code>ddply()</code> + various functions, <code>do()</code> is equivalent to <code>dlply()</code>)</p></li>
+</ul>
+<p>Compared to virtual data frame approaches:</p>
+<ul>
+<li><p>it doesn’t pretend that you have a data frame: if you want to run lm etc, you’ll still need to manually pull down the data</p></li>
+<li><p>it doesn’t provide methods for R summary functions (e.g. <code>mean()</code>, or <code>sum()</code>)</p></li>
+</ul>
+</div>
+
+
+
+<!-- dynamically load mathjax for compatibility with self-contained -->
+<script>
+ (function () {
+ var script = document.createElement("script");
+ script.type = "text/javascript";
+ script.src = "https://cdn.mathjax.org/mathjax/latest/MathJax.js?config=TeX-AMS-MML_HTMLorMML";
+ document.getElementsByTagName("head")[0].appendChild(script);
+ })();
+</script>
+
+</body>
+</html>
diff --git a/vignettes/new-sql-backend.Rmd b/vignettes/new-sql-backend.Rmd
deleted file mode 100644
index 8d2f82f..0000000
--- a/vignettes/new-sql-backend.Rmd
+++ /dev/null
@@ -1,123 +0,0 @@
----
-title: "Adding a new SQL backend"
-date: "`r Sys.Date()`"
-output: rmarkdown::html_vignette
-vignette: >
- %\VignetteIndexEntry{Adding a new SQL backend}
- %\VignetteEngine{knitr::rmarkdown}
- \usepackage[utf8]{inputenc}
----
-
-```{r, echo = FALSE, message = FALSE}
-knitr::opts_chunk$set(collapse = T, comment = "#>")
-options(tibble.print_min = 4L, tibble.print_max = 4L)
-library(dplyr)
-```
-
-This document describes how to add a new SQL backend to dplyr. To begin:
-
-* Ensure that you have a DBI compliant database backend. If not, you'll need
- to first create it by following the instructions in
- `vignette("backend", package = "DBI")`.
-
-* You'll need a working knowledge of S3. Make sure that you're
- [familiar with the basics](http://adv-r.had.co.nz/OO-essentials.html#s3)
- before you start.
-
-This document is still a work in progress, but it will hopefully get you started. If you're familiar with how your database and at least one other database that dplyr supports already, this should be a reasonably simple task. However, it is possible that a new database backend may need new methods - I'm happy to add those as needed.
-
-## Create the src object
-
-Start by creating a new src function to represent the backend. Assuming we're going to create a src for postgres, you'd call it `src_postgres()`, and you'd follow the pattern of an existing src. A simplified version of `src_postgres()` is shown below:
-
-```{r, eval = FALSE}
-src_postgres <- function(dbname = NULL, host = NULL, port = NULL, user = NULL,
- password = NULL, ...) {
-
- con <- dbConnect(PostgreSQL(), host = host %||% "", dbname = dbname %||% "",
- user = user, password = password %||% "", port = port %||% "", ...)
-
- src_sql("postgres", con)
-}
-```
-
-Use `src_sql()` to create a new S3 object with the correct structure. It must have a DBI connection, but it can store anything else that might be useful.
-
-Next, implement a method for `src_desc()` that briefly describes the source:
-
-```{r}
-#' @export
-src_desc.src_postgres <- function(con) {
- info <- dbGetInfo(con)
- host <- if (info$host == "") "localhost" else info$host
-
- paste0("postgres ", info$serverVersion, " [", info$user, "@",
- host, ":", info$port, "/", info$dbname, "]")
-}
-```
-
-If you read the source code for the real `src_postgres()` you'll notice that it caches the `dbGetInfo()` field on creation, since this saves a little time when printing tbls.
-
-Before continuing, check that you can create a connection to a local database, and that you get a listing of the existing tables. If you have a problem at this point, you may need to check the DBI backend.
-
-## tbl
-
-Next implement the `tbl()` method for your data source. This will probably just be:
-
-```{r, eval = FALSE}
-tbl.src_mssql <- function(src, from, ...) {
- tbl_sql("mssql", src = src, from = from, ...)
-}
-```
-
-Before continuing, make sure you can connect to an existing table, and that the results are printed reasonably. If not, that may indicate your database uses a non-standard DBI interface, and you'll need to fix that before continuing.
-
-This is also a good time to implement `explain()`, by adding a method for `db_explain()`.
-
-If your database uses non-standard quoting (i.e. something other than `"` for identifiers and `'` for strings), implement methods for `sql_escape_string()` and `sql_escape_ident()`.
-
-You may need to implement `db_query_fields()`, which should return a character vector giving the field names of a query.
-
-At this point, all the basic verbs (`summarise()`, `filter()`, `arrange()`, `mutate()` etc) should also work, but it's hard to test without some data.
-
-## `copy_to()`
-
-Next, implement the methods that power `copy_to()`. Once you've implemented these methods, you'll be able to copy datasets from R into your database, which will make testing much easier.
-
-* `db_data_type()`
-* `sql_begin()`, `sql_commit()`, `sql_rollback()`
-* `sql_create_table()`, `sql_insert_into()`, `sql_drop_table()`
-* `sql_create_index()`, `sql_analyze()`
-
-If the database doesn't support a function, just return `TRUE` without doing anything. If you find these methods a very poor match to your backend, you may find it easier to provide a direct `copy_to()` method.
-
-At this point, you should be able to copy the nycflights13 data packages into your database with (e.g.):
-
-```{r, eval = FALSE}
-copy_nycflights13(src_mssql(...))
-copy_lahman(src_mssql(...))
-```
-
-Don't proceed further until this works, and you've verified that the basic single table verbs work.
-
-## Compute, collect and collapse
-
-Next, check that `collapse()`, `compute()`, and `collect()` work.
-
-* If `collapse()` fails, your database has a non-standard way of constructing
- subqueries. Add a method for `sql_subquery()`.
-
-* If `compute()` fails, your database has a non-standard way of saving queries
- in temporary tables. Add a method for `db_save_query()`.
-
-## Multi table verbs
-
-Next check the multitable verbs:
-
-* `left_join()`, `inner_join()`: powered by `sql_join()`
-* `semi_join()`, `anti_join()`: powered by `sql_semi_join()`
-* `union()`, `intersect()`, `setdiff()`: powered by `sql_set_op()`
-
-## sql translation
-
-To finish off, you can add custom R -> SQL translation by providing a method for `src_translate_env()`. This function should return an object created by `sql_variant()`. See existing methods for examples.
diff --git a/inst/doc/new-sql-backend.html b/vignettes/new-sql-backend.html
similarity index 99%
rename from inst/doc/new-sql-backend.html
rename to vignettes/new-sql-backend.html
index 6a2c6da..863d803 100644
--- a/inst/doc/new-sql-backend.html
+++ b/vignettes/new-sql-backend.html
@@ -11,7 +11,7 @@
<meta name="viewport" content="width=device-width, initial-scale=1">
-<meta name="date" content="2016-06-23" />
+<meta name="date" content="2016-10-27" />
<title>Adding a new SQL backend</title>
@@ -68,7 +68,7 @@ code > span.in { color: #60a0b0; font-weight: bold; font-style: italic; } /* Inf
<h1 class="title toc-ignore">Adding a new SQL backend</h1>
-<h4 class="date"><em>2016-06-23</em></h4>
+<h4 class="date"><em>2016-10-27</em></h4>
diff --git a/vignettes/notes/mysql-setup.Rmd b/vignettes/notes/mysql-setup.Rmd
deleted file mode 100644
index 40a9a0b..0000000
--- a/vignettes/notes/mysql-setup.Rmd
+++ /dev/null
@@ -1,40 +0,0 @@
-<!--
-%\VignetteEngine{knitr}
-%\VignetteIndexEntry{MySQL setup}
--->
-
-# Setting up MariaDB
-
-First install maria db, create a data directory, and create a default database.
-
-```
-# brew update
-# brew unlink postgresql
-brew install mariadb
-mysql_install_db --verbose --user=hadley --basedir=/usr/local \
- --datadir=/User/hadley/db/mariadb --tmpdir=/tmp
-
-mysqld --datadir='/Users/hadley/db/mysql'
-
-mysql -u root -p -e "CREATE DATABASE lahman;"
-mysql -u root -p -e "CREATE DATABASE nycflights13;"
-mysql -u root -p -e "CREATE DATABASE test;"
-```
-
-Then each time, run:
-
-```
-mysqld --datadir='/Users/hadley/db/mysql'
-mysqladmin shutdown -u root -p
-```
-
-From R,
-
-```{r, eval = FALSE}
-install.packages("RMySQL")
-library(RMySQL)
-
-drv <- dbDriver("MySQL")
-con <- dbConnect(drv, dbname = "lahman", username = "root", password = "")
-dbListTables(con)
-```
diff --git a/vignettes/notes/postgres-setup.Rmd b/vignettes/notes/postgres-setup.Rmd
deleted file mode 100644
index b8c2d31..0000000
--- a/vignettes/notes/postgres-setup.Rmd
+++ /dev/null
@@ -1,36 +0,0 @@
-<!--
-%\VignetteEngine{knitr}
-%\VignetteIndexEntry{PostgreSQL setup}
--->
-
-# Setting up Postgresql
-
-First install postgresql, create a data directory, and create a default database.
-
-```
-# brew update
-# brew unlink postgresql
-brew install postgresql
-export PGDATA=~/db/postgres-9.5 # set this globally somewhere
-initdb -E utf8
-createdb
-createdb lahman
-createdb nycflights13
-```
-
-Then on reboot run
-
-```
-pg_ctl start
-```
-
-## Check connection
-
-From R,
-
-```{r, eval = FALSE}
-install.packages("RPostgreSQL")
-library(DBI)
-con <- dbConnect(RPostgreSQL::PostgreSQL(), dbname = "hadley")
-dbListTables(con)
-```
diff --git a/vignettes/notes/vagrant-setup.Rmd b/vignettes/notes/vagrant-setup.Rmd
deleted file mode 100644
index 0eca4a4..0000000
--- a/vignettes/notes/vagrant-setup.Rmd
+++ /dev/null
@@ -1,42 +0,0 @@
-<!--
-%\VignetteEngine{knitr}
-%\VignetteIndexEntry{Vagrant setup}
--->
-
-# Vagrant set up
-
-Run this code to set up databases on vagrant vm.
-https://gist.github.com/trestletech/b5babf76487fa6c898b2
-
-```
-# Install needed apt packages
-sudo apt-get install postgresql libpq-dev
-sudo apt-get install mysql-server libmysqlclient-dev
-
-# Set up database users
-sudo su -
-sudo -u postgres createuser --superuser Ruser
-sudo -u postgres psql
-\password Ruser
-# press enter twice
-# Ctrl + D
-
-mysql -u root -p -e "CREATE DATABASE lahman;"
-mysql -u root -p -e "CREATE DATABASE nycflights13;"
-mysql -u root -p -e "CREATE DATABASE test;"
-
-sudo -u postgres createdb lahman
-sudo -u postgres createdb nycflights13
-sudo -u postgres createdb test
-# Ctrl + D
-
-R
-install.packages(c("RPostgreSQL", "RMySQL"))
-# Ctrl + D
-
-library(dplyr)
-lahman_mysql()
-lahman_postgres()
-nycflights13_postgres()
-
-```
diff --git a/vignettes/nse.Rmd b/vignettes/nse.Rmd
deleted file mode 100644
index 44d0804..0000000
--- a/vignettes/nse.Rmd
+++ /dev/null
@@ -1,102 +0,0 @@
----
-title: "Non-standard evaluation"
-date: "`r Sys.Date()`"
-output: rmarkdown::html_vignette
-vignette: >
- %\VignetteIndexEntry{Non-standard evaluation}
- %\VignetteEngine{knitr::rmarkdown}
- %\usepackage[utf8]{inputenc}
----
-
-```{r, echo = FALSE, message = FALSE}
-knitr::opts_chunk$set(collapse = T, comment = "#>")
-options(tibble.print_min = 4L, tibble.print_max = 4L)
-library(dplyr)
-```
-
-Dplyr uses non-standard evaluation (NSE) in all the important single table verbs: `filter()`, `mutate()`, `summarise()`, `arrange()`, `select()` and `group_by()`. NSE is important not only because it reduces typing; for database backends, it's what makes it possible to translate R code into SQL. However, while NSE is great for interactive use it's hard to program with. This vignette describes how you can opt out of NSE in dplyr, and instead (with a little quoting) rely only on standard e [...]
-
-Behind the scenes, NSE is powered by the [lazyeval](https://github.com/hadley/lazyeval) package. The goal is to provide an approach to NSE that you can learn once and then apply in many places (dplyr is the first of my packages to use this approach, but over time I will implement it everywhere). You may want to read the lazyeval vignettes, if you'd like to learn more about the underlying details, or if you'd like to use this approach in your own packages.
-
-## Standard evaluation basics
-
-Every function in dplyr that uses NSE also has a version that uses SE. The name of the SE version is always the NSE name with an `_` on the end. For example, the SE version of `summarise()` is `summarise_()`; the SE version of `arrange()` is `arrange_()`. These functions work very similarly to their NSE cousins, but their inputs must be "quoted":
-
-```{r}
-# NSE version:
-summarise(mtcars, mean(mpg))
-
-# SE versions:
-summarise_(mtcars, ~mean(mpg))
-summarise_(mtcars, quote(mean(mpg)))
-summarise_(mtcars, "mean(mpg)")
-```
-
-There are three ways to quote inputs that dplyr understands:
-
-* With a formula, `~ mean(mpg)`.
-* With `quote()`, `quote(mean(mpg))`.
-* As a string: `"mean(mpg)"`.
-
-It's best to use a formula because a formula captures both the expression to evaluate and the environment where the evaluation occurs. This is important if the expression is a mixture of variables in a data frame and objects in the local environment:
-
-```{r}
-constant1 <- function(n) ~n
-summarise_(mtcars, constant1(4))
-```
-
-```{r, error = TRUE, purl = FALSE}
-# Using anything other than a formula will fail because it doesn't
-# know which environment to look in
-constant2 <- function(n) quote(n)
-summarise_(mtcars, constant2(4))
-```
-
-## Setting variable names
-
-If you also want output variables to vary, you need to pass a list of quoted objects to the `.dots` argument:
-
-```{r}
-n <- 10
-dots <- list(~mean(mpg), ~n)
-summarise_(mtcars, .dots = dots)
-
-summarise_(mtcars, .dots = setNames(dots, c("mean", "count")))
-```
-
-## Mixing constants and variables
-
-What if you need to mingle constants and variables? Use the handy `lazyeval::interp()`:
-
-```{r}
-library(lazyeval)
-# Interp works with formulas, quoted calls and strings (but formulas are best)
-interp(~ x + y, x = 10)
-interp(quote(x + y), x = 10)
-interp("x + y", x = 10)
-
-# Use as.name if you have a character string that gives a variable name
-interp(~ mean(var), var = as.name("mpg"))
-# or supply the quoted name directly
-interp(~ mean(var), var = quote(mpg))
-```
-
-Because [every action in R is a function call](http://adv-r.had.co.nz/Functions.html#all-calls) you can use this same idea to modify functions:
-
-```{r}
-interp(~ f(a, b), f = quote(mean))
-interp(~ f(a, b), f = as.name("+"))
-interp(~ f(a, b), f = quote(`if`))
-```
-
-If you already have a list of values, use `.values`:
-
-```{r}
-interp(~ x + y, .values = list(x = 10))
-
-# You can also interpolate variables defined in the current
-# environment, but this is a little risky because it's easy
-# for this to change without you realising
-y <- 10
-interp(~ x + y, .values = environment())
-```
diff --git a/inst/doc/nse.html b/vignettes/nse.html
similarity index 98%
rename from inst/doc/nse.html
rename to vignettes/nse.html
index 76d404b..07ab073 100644
--- a/inst/doc/nse.html
+++ b/vignettes/nse.html
@@ -11,7 +11,7 @@
<meta name="viewport" content="width=device-width, initial-scale=1">
-<meta name="date" content="2016-06-23" />
+<meta name="date" content="2016-10-27" />
<title>Non-standard evaluation</title>
@@ -68,7 +68,7 @@ code > span.in { color: #60a0b0; font-weight: bold; font-style: italic; } /* Inf
<h1 class="title toc-ignore">Non-standard evaluation</h1>
-<h4 class="date"><em>2016-06-23</em></h4>
+<h4 class="date"><em>2016-10-27</em></h4>
@@ -107,7 +107,7 @@ code > span.in { color: #60a0b0; font-weight: bold; font-style: italic; } /* Inf
<span class="co"># know which environment to look in</span>
constant2 <-<span class="st"> </span>function(n) <span class="kw">quote</span>(n)
<span class="kw">summarise_</span>(mtcars, <span class="kw">constant2</span>(<span class="dv">4</span>))
-<span class="co">#> Error in eval(expr, envir, enclos): binding not found: 'n'</span></code></pre></div>
+<span class="co">#> Error in eval(substitute(expr), envir, enclos): binding not found: 'n'</span></code></pre></div>
</div>
<div id="setting-variable-names" class="section level2">
<h2>Setting variable names</h2>
@@ -156,7 +156,7 @@ dots <-<span class="st"> </span><span class="kw">list</span>(~<span class="kw
<span class="co"># for this to change without you realising</span>
y <-<span class="st"> </span><span class="dv">10</span>
<span class="kw">interp</span>(~<span class="st"> </span>x +<span class="st"> </span>y, <span class="dt">.values =</span> <span class="kw">environment</span>())
-<span class="co">#> ~x + 10</span></code></pre></div>
+<span class="co">#> ~c(1, NA, 2, 3, NA) + 10</span></code></pre></div>
</div>
diff --git a/vignettes/programming.Rmd b/vignettes/programming.Rmd
new file mode 100644
index 0000000..43eadb4
--- /dev/null
+++ b/vignettes/programming.Rmd
@@ -0,0 +1,587 @@
+---
+title: "Programming with dplyr"
+output: rmarkdown::html_vignette
+vignette: >
+ %\VignetteIndexEntry{Programming with dplyr}
+ %\VignetteEngine{knitr::rmarkdown}
+ %\usepackage[utf8]{inputenc}
+---
+
+```{r setup, echo = FALSE, message = FALSE}
+knitr::opts_chunk$set(collapse = T, comment = "#>")
+options(tibble.print_min = 4L, tibble.print_max = 4L)
+library(dplyr)
+set.seed(1014)
+```
+
+Most dplyr functions use non-standard evaluation (NSE). This is a catch-all term that means they don't follow the usual R rules of evaluation. Instead, they capture the expression that you typed and evaluate it in a custom way. This has two main benefits for dplyr code:
+
+* Operations on data frames can be expressed succinctly because
+ you don't need to repeat the name of the data frame. For example,
+ you can write `filter(df, x == 1, y == 2, z == 3)` instead of
+ `df[df$x == 1 & df$y == 2 & df$z == 3, ]`.
+
+* dplyr can choose to compute results in a different way to base R.
+ This is important for database backends because dplyr itself doesn't
+ do any work, but instead generates the SQL that tells the database
+ what to do.
+
+Unfortunately these benefits do not come for free. There are two main drawbacks:
+
+* Most dplyr arguments are not __referentially transparent__. That means
+ you can't replace a value with a seemingly equivalent object that you've
+ defined elsewhere. In other words, this code:
+
+ ```{r}
+ df <- tibble(x = 1:3, y = 3:1)
+ filter(df, x == 1)
+ ```
+
+ Is not equivalent to this code:
+
+ ```{r, error = TRUE}
+ my_var <- x
+ filter(df, my_var == 1)
+ ```
+
+ nor to this code:
+
+ ```{r, error = TRUE}
+ my_var <- "x"
+ filter(df, my_var == 1)
+ ```
+
+ This makes it hard to create functions with arguments that change how
+ dplyr verbs are computed.
+
+* dplyr code is ambiguous. Depending on what variables are defined where,
+ `filter(df, x == y)` could be equivalent to any of:
+
+ ```{r, eval = FALSE}
+ df[df$x == df$y, ]
+ df[df$x == y, ]
+ df[x == df$y, ]
+ df[x == y, ]
+ ```
+
+ This is useful when working interactively (because it saves typing and you
+ quickly spot problems) but makes functions more unpredictable than you
+ might desire.
+
+Fortunately, dplyr provides tools to overcome these challenges. They require a little more typing, but a small amount of upfront work is worth it because they help you save time in the long run.
+
+This vignette has two goals:
+
+* Show you how to use dplyr's __pronouns__ and __quasiquotation__
+ to write reliable functions that reduce duplication in your data analysis
+ code.
+
+* To teach you the underlying theory including __quosures__, the data
+ structure that stores both an expression and an environment, and
+ __tidyeval__, the underlying toolkit.
+
+We'll start with a warmup, tying this problem to something you're more familiar with, then move on to some practical tools, then dive into the deeper theory.
+
+## Warm up
+
+You might not have realised it, but you're already accomplished at solving this type of problem in another domain: strings. It's obvious that this function doesn't do what you want:
+
+```{r}
+greet <- function(name) {
+ "How do you do, name?"
+}
+greet("Hadley")
+```
+
+That's because `"` "quotes" its input: it doesn't interpret what you've typed, it just stores it in a string. One way to make the function do what you want is to use `paste()` to build up the string piece by piece:
+
+```{r}
+greet <- function(name) {
+ paste0("How do you do, ", name, "?")
+}
+greet("Hadley")
+```
+
+Another approach is exemplified by the __glue__ package: it allows you to "unquote" components of a string, replacing the string with the value of the R expression. This allows an elegant implementation of our function because `{name}` is replaced with the value of the `name` argument.
+
+```{r}
+greet <- function(name) {
+ glue::glue("How do you do, {name}?")
+}
+greet("Hadley")
+```
+
+## Programming recipes
+
+The following recipes walk you through the basics of tidyeval, with the nominal goal of reducing duplication in dplyr code. The examples here are somewhat inauthentic because we've reduced them down to very simple components to make them easier to understand. They're so simple that you might wonder why we bother writing a function at all. But it's a good idea to learn the ideas on simple examples, so that you're better prepared to apply them to the more complex situations you'll see in y [...]
+
+### Different data sets
+
+You already know how to write functions that work with the first argument of dplyr verbs: the data. That's because dplyr doesn't do anything special with that argument, so it's referentially transparent. For example, if you saw repeated code like this:
+
+```{r, eval = FALSE}
+mutate(df1, y = a + x)
+mutate(df2, y = a + x)
+mutate(df3, y = a + x)
+mutate(df4, y = a + x)
+```
+
+You could already write a function to capture that duplication:
+
+```{r}
+mutate_y <- function(df) {
+ mutate(df, y = a + x)
+}
+```
+
+Unfortunately, there's a drawback to this simple approach: it can fail silently if one of the variables isn't present in the data frame, but is present in the global environment.
+
+```{r}
+df1 <- tibble(x = 1:3)
+a <- 10
+mutate_y(df1)
+```
+
+We can fix that ambiguity by being more explicit and using the `.data` pronoun. This will throw an informative error if the variable doesn't exist:
+
+```{r, error = TRUE}
+mutate_y <- function(df) {
+ mutate(df, y = .data$a + .data$x)
+}
+
+mutate_y(df1)
+```
+
+If this function is in a package, using `.data` also prevents `R CMD check` from giving a NOTE about undefined global variables (provided that you've also imported `rlang::.data` with `@importFrom rlang .data`).
+
+### Different expressions
+
+Writing a function is hard if you want one of the arguments to be a variable name (like `x`) or an expression (like `x + y`). That's because dplyr automatically "quotes" those inputs, so they are not referentially transparent. Let's start with a simple case: you want to vary the grouping variable for a data summarization.
+
+```{r}
+df <- tibble(
+ g1 = c(1, 1, 2, 2, 2),
+ g2 = c(1, 2, 1, 2, 1),
+ a = sample(5),
+ b = sample(5)
+)
+
+df %>%
+ group_by(g1) %>%
+ summarise(a = mean(a))
+
+df %>%
+ group_by(g2) %>%
+ summarise(a = mean(a))
+```
+
+You might hope that this will work:
+
+```{r, error = TRUE}
+my_summarise <- function(df, group_var) {
+ df %>%
+ group_by(group_var) %>%
+ summarise(a = mean(a))
+}
+
+my_summarise(df, g1)
+```
+
+But it doesn't.
+
+Maybe providing the variable name as a string will fix things?
+
+```{r, error = TRUE}
+my_summarise(df, "g2")
+```
+
+Nope.
+
+If you look carefully at the error message, you'll see that it's the same in both cases. `group_by()` works like `"`: it doesn't evaluate its input; it quotes it.
+
+To make this function work, we need to do two things. We need to quote the input ourselves (so `my_summarise()` can take a bare variable name like `group_by()`), and then we need to tell `group_by()` not to quote its input (because we've done the quoting).
+
+How do we quote the input? We can't use `""` to quote the input, because that gives us a string. Instead we need a function that captures the expression and its environment (we'll come back to why this is important later on). There are two possible options we could use in base R, the function `quote()` and the operator `~`. Neither of these work quite the way we want, so we need a new function: `quo()`.
+
+`quo()` works like `"`: it quotes its input rather than evaluating it.
+
+```{r}
+quo(g1)
+quo(a + b + c)
+quo("a")
+```
+
+`quo()` returns a __quosure__, which is a special type of formula. You'll learn more about quosures later on.
+
+Now that we've captured this expression, how do we use it with `group_by()`? It doesn't work if we just shove it into our naive approach:
+
+```{r, error = TRUE}
+my_summarise(df, quo(g1))
+```
+
+We get the same error as before, because we haven't yet told `group_by()` that we're taking care of the quoting. In other words, we need to tell `group_by()` not to quote its input, because it has been pre-quoted by `my_summarise()`. Yet another way of saying the same thing is that we want to __unquote__ `group_var`.
+
+In dplyr (and in tidyeval in general) you use `!!` to say that you want to unquote an input so that it's evaluated, not quoted. This gives us a function that actually does what we want.
+
+```{r}
+my_summarise <- function(df, group_var) {
+ df %>%
+ group_by(!!group_var) %>%
+ summarise(a = mean(a))
+}
+
+my_summarise(df, quo(g1))
+```
+
+Huzzah!
+
+There's just one step left: we want to call this function like we call `group_by()`:
+
+```{r, eval = FALSE}
+my_summarise(df, g1)
+```
+
+This doesn't work because there's no object called `g1`. We need to capture what the user of the function typed and quote it for them. You might try using `quo()` to do that:
+
+```{r, error = TRUE}
+my_summarise <- function(df, group_var) {
+ quo_group_var <- quo(group_var)
+ print(quo_group_var)
+
+ df %>%
+ group_by(!!quo_group_var) %>%
+ summarise(a = mean(a))
+}
+
+my_summarise(df, g1)
+```
+
+I've added a `print()` call to make it obvious what's going wrong here: `quo(group_var)` always returns `~group_var`. It is being too literal! We want it to substitute the value that the user supplied, i.e. to return `~g1`.
+
+By analogy to strings, we don't want `""`, instead we want some function that turns an argument into a string. That's the job of `enquo()`. `enquo()` uses some dark magic to look at the argument, see what the user typed, and return that value as a quosure. (Technically, this works because function arguments are evaluated lazily, using a special data structure called a __promise__.)
+
+```{r}
+my_summarise <- function(df, group_var) {
+ group_var <- enquo(group_var)
+ print(group_var)
+
+ df %>%
+ group_by(!!group_var) %>%
+ summarise(a = mean(a))
+}
+
+my_summarise(df, g1)
+```
+
+(If you're familiar with `quote()` and `substitute()` in base R, `quo()` is equivalent to `quote()` and `enquo()` is equivalent to `substitute()`.)
+
+You might wonder how to extend this to handle multiple grouping variables: we'll come back to that a little later.
+
+### Different input variable
+
+Now let's tackle something a bit more complicated. The code below shows a duplicate `summarise()` statement where we compute three summaries, varying the input variable.
+
+```{r}
+summarise(df, mean = mean(a), sum = sum(a), n = n())
+summarise(df, mean = mean(a * b), sum = sum(a * b), n = n())
+```
+
+To turn this into a function, we start by testing the basic approach interactively: we quote the variable with `quo()`, then unquoting it in the dplyr call with `!!`. Notice that we can unquote anywhere inside a complicated expression.
+
+```{r}
+my_var <- quo(a)
+summarise(df, mean = mean(!!my_var), sum = sum(!!my_var), n = n())
+```
+
+You can also wrap `quo()` around the dplyr call to see what will happen from dplyr's perspective. This is a very useful tool for debugging.
+
+```{r}
+quo(summarise(df,
+ mean = mean(!!my_var),
+ sum = sum(!!my_var),
+ n = n()
+))
+```
+
+Now we can turn our code into a function (remembering to replace `quo()` with `enquo()`), and check that it works:
+
+```{r}
+my_summarise2 <- function(df, expr) {
+ expr <- enquo(expr)
+
+ summarise(df,
+ mean = mean(!!expr),
+ sum = sum(!!expr),
+ n = n()
+ )
+}
+my_summarise2(df, a)
+my_summarise2(df, a * b)
+```
+
+### Different input and output variable
+
+The next challenge is to vary the name of the output variables:
+
+```{r}
+mutate(df, mean_a = mean(a), sum_a = sum(a))
+mutate(df, mean_b = mean(b), sum_b = sum(b))
+```
+
+This code is similar to the previous example, but there are two new wrinkles:
+
+* We create the new names by pasting together strings, so
+ we need `quo_name()` to convert the input expression to a string.
+
+* `!!mean_name = mean(!!expr)` isn't valid R code, so we need to
+ use the `:=` helper provided by rlang.
+
+```{r}
+my_mutate <- function(df, expr) {
+ expr <- enquo(expr)
+ mean_name <- paste0("mean_", quo_name(expr))
+ sum_name <- paste0("sum_", quo_name(expr))
+
+ mutate(df,
+ !!mean_name := mean(!!expr),
+ !!sum_name := sum(!!expr)
+ )
+}
+
+my_mutate(df, a)
+```
+
+### Capturing multiple variables
+
+It would be nice to extend `my_summarise()` to accept any number of grouping variables. We need to make three changes:
+
+* Use `...` in the function definition so our function can accept any number
+ of arguments.
+
+* Use `quos()` to capture all the `...` as a list of formulas.
+
+* Use `!!!` instead of `!!` to __splice__ the arguments into `group_by()`.
+
+```{r}
+my_summarise <- function(df, ...) {
+ group_var <- quos(...)
+
+ df %>%
+ group_by(!!!group_var) %>%
+ summarise(a = mean(a))
+}
+
+my_summarise(df, g1, g2)
+```
+
+`!!!` takes a list of elements and splices them into the current call. Look at the bottom of the `!!!` and think `...`.
+
+```{r}
+args <- list(na.rm = TRUE, trim = 0.25)
+quo(mean(x, !!! args))
+
+args <- list(quo(x), na.rm = TRUE, trim = 0.25)
+quo(mean(!!! args))
+```
+
+Now that you've learned the basics of tidyeval through some practical examples, we'll dive into the theory. This will help you generalise what you've learned here to new situations.
+
+## Quoting
+
+Quoting is the action of capturing an expression instead of evaluating it. All expression-based functions quote their arguments and get the R code as an expression rather than the result of evaluating that code. If you are an R user, you probably quote expressions on a regular basis. One of the most important quoting operators in R is the _formula_. It is famously used for the specification of statistical models:
+
+```{r}
+disp ~ cyl + drat
+```
+
+The other quoting operator in base R is `quote()`. It returns a raw
+expression rather than a formula:
+
+```{r}
+# Computing the value of the expression:
+toupper(letters[1:5])
+
+# Capturing the expression:
+quote(toupper(letters[1:5]))
+```
+
+(Note that despite being called the double quote, `"` is not a quoting operator in this context, because it generates a string, not an expression.)
+
+In practice, the formula is the better of the two options because it captures the code and its execution __environment__. This is important because even simple expressions can yield different values in different environments. For example, the `x` in the following two expressions refers to different values:
+
+```{r}
+f <- function(x) {
+ quo(x)
+}
+
+x1 <- f(10)
+x2 <- f(100)
+```
+
+It might look like the expressions are the same if you print them out.
+
+```{r}
+x1
+x2
+```
+
+But if you inspect the environments using `rlang::get_env()` --- they're different.
+```{r, message = FALSE}
+library(rlang)
+
+get_env(x1)
+get_env(x2)
+```
+
+Further, when we evaluate those formulas using `rlang::eval_tidy()`, we see that they yield different values:
+
+```{r}
+eval_tidy(x1)
+eval_tidy(x2)
+```
+
+This is a key property of R: one name can refer to different values in different environments. This is also important for dplyr, because it allows you to combine variables and objects in a call:
+
+```{r}
+user_var <- 1000
+mtcars %>% summarise(cyl = mean(cyl) * user_var)
+```
+
+When an object keeps track of an environment, it is said to have an enclosure. This is the reason that functions in R are sometimes referred to as closures:
+
+```{r}
+typeof(mean)
+```
+
+For this reason we use a special name to refer to one-sided formulas: __quosures__. One-sided formulas are quotes (they carry an expression) with an environment.
+
+Quosures are regular R objects. They can be stored in a variable and inspected:
+
+```{r}
+var <- ~toupper(letters[1:5])
+var
+
+# You can extract its expression:
+get_expr(var)
+
+# Or inspect its enclosure:
+get_env(var)
+```
+
+## Quasiquotation
+
+> Put simply, quasi-quotation enables one to introduce symbols that stand for
+> a linguistic expression in a given instance and are used as that linguistic
+> expression in a different instance.
+--- [Willard van Orman Quine](https://en.wikipedia.org/wiki/Quasi-quotation)
+
+Automatic quoting makes dplyr very convenient for interactive use. But if you want to program with dplyr, you need some way to refer to variables indirectly. The solution to this problem is __quasiquotation__, which allows you to evaluate directly inside an expression that is otherwise quoted.
+
+Quasiquotation was coined by Willard van Orman Quine in the 1940s, and was adopted for programming by the LISP community in the 1970s. All expression-based functions in the tidyeval framework support quasiquotation. Unquoting cancels quotation of parts of an expression. There are three types of unquoting:
+
+* basic
+* unquote splicing
+* unquoting names
+
+### Unquoting
+
+The first important operation is the basic unquote, which comes in a functional form, `UQ()`, and as syntactic-sugar, `!!`.
+
+```{r}
+# Here we capture `letters[1:5]` as an expression:
+quo(toupper(letters[1:5]))
+
+# Here we capture the value of `letters[1:5]`
+quo(toupper(!!letters[1:5]))
+quo(toupper(UQ(letters[1:5])))
+```
+
+It is also possible to unquote other quoted expressions. Unquoting such
+symbolic objects provides a powerful way of manipulating expressions.
+
+```{r}
+var1 <- quo(letters[1:5])
+quo(toupper(!!var1))
+```
+
+You can safely unquote quosures because they track their environments, and tidyeval functions know how to evaluate them. This allows any depth of quoting and unquoting.
+
+```{r}
+my_mutate <- function(x) {
+ mtcars %>%
+ select(cyl) %>%
+ slice(1:4) %>%
+ mutate(cyl2 = cyl + (!! x))
+}
+
+f <- function(x) quo(x)
+expr1 <- f(100)
+expr2 <- f(10)
+
+my_mutate(expr1)
+my_mutate(expr2)
+```
+
+The functional form is useful in cases where the precedence of `!` causes problems:
+
+```{r, error = TRUE}
+my_fun <- quo(fun)
+quo(!!my_fun(x, y, z))
+quo(UQ(my_fun)(x, y, z))
+
+my_var <- quo(x)
+quo(filter(df, !!my_var == 1))
+quo(filter(df, UQ(my_var) == 1))
+```
+
+You'll note above that `UQ()` yields a quosure containing a formula. That ensures that when the quosure is evaluated, it'll be looked up in the right environment. In certain code-generation scenarios you just want to use the expression and ignore the environment. That's the job of `UQE()`:
+
+```{r}
+quo(UQE(my_fun)(x, y, z))
+quo(filter(df, UQE(my_var) == 1))
+```
+
+`UQE()` is for expert use only as you'll have to carefully analyse the environments to ensure that the generated code is correct.
+
+### Unquote-splicing
+
+The second unquote operation is unquote-splicing. Its functional form is `UQS()` and the syntactic shortcut is `!!!`. It takes a vector and inserts each element of the vector in the surrounding function call:
+
+```{r}
+quo(list(!!! letters[1:5]))
+```
+
+A very useful feature of unquote-splicing is that the vector names
+become argument names:
+
+```{r}
+x <- list(foo = 1L, bar = quo(baz))
+quo(list(!!! x))
+```
+
+This makes it easy to program with dplyr verbs that take named dots:
+
+```{r}
+args <- list(mean = quo(mean(cyl)), count = quo(n()))
+mtcars %>%
+ group_by(am) %>%
+ summarise(!!! args)
+```
+
+### Setting variable names
+
+The final unquote operation is setting argument names. You've seen one way to do that above, but you can also use the definition operator `:=` instead of `=`. `:=` supports unquoting on both the LHS and the RHS.
+
+The rules on the LHS are slightly different: the unquoted operand should evaluate to a string or a symbol.
+
+```{r}
+mean_nm <- "mean"
+count_nm <- "count"
+
+mtcars %>%
+ group_by(am) %>%
+ summarise(
+ !!mean_nm := mean(cyl),
+ !!count_nm := n()
+ )
+```
diff --git a/vignettes/two-table.Rmd b/vignettes/two-table.Rmd
index 7e5bae7..73d88a6 100644
--- a/vignettes/two-table.Rmd
+++ b/vignettes/two-table.Rmd
@@ -1,6 +1,5 @@
---
title: "Two-table verbs"
-date: "`r Sys.Date()`"
output: rmarkdown::html_vignette
vignette: >
%\VignetteIndexEntry{Two-table verbs}
@@ -186,25 +185,6 @@ setdiff(df1, df2)
setdiff(df2, df1)
```
-## Databases
-
-Each two-table verb has a straightforward SQL equivalent:
-
-| R | SQL
-|------------------|--------
-| `inner_join()` | `SELECT * FROM x JOIN y ON x.a = y.a`
-| `left_join()` | `SELECT * FROM x LEFT JOIN y ON x.a = y.a`
-| `right_join()` | `SELECT * FROM x RIGHT JOIN y ON x.a = y.a`
-| `full_join()` | `SELECT * FROM x FULL JOIN y ON x.a = y.a`
-| `semi_join()` | `SELECT * FROM x WHERE EXISTS (SELECT 1 FROM y WHERE x.a = y.a)`
-| `anti_join()` | `SELECT * FROM x WHERE NOT EXISTS (SELECT 1 FROM y WHERE x.a = y.a)`
-| `intersect(x, y)`| `SELECT * FROM x INTERSECT SELECT * FROM y`
-| `union(x, y)` | `SELECT * FROM x UNION SELECT * FROM y`
-| `setdiff(x, y)` | `SELECT * FROM x EXCEPT SELECT * FROM y`
-
-`x` and `y` don't have to be tables in the same database. If you specify `copy = TRUE`, dplyr will copy the `y` table into the same location as the `x` variable. This is useful if you've downloaded a summarised dataset and determined a subset of interest that you now want the full data for. You can use `semi_join(x, y, copy = TRUE)` to upload the indices of interest to a temporary table in the same database as `x`, and then perform a efficient semi join in the database.
-
-If you're working with large data, it maybe also be helpful to set `auto_index = TRUE`. That will automatically add an index on the join variables to the temporary table.
## Coercion rules
@@ -257,4 +237,4 @@ full_join(df1, df2) %>% str()
## Multiple-table verbs
-dplyr does not provide any functions for working with three or more tables. Instead use `Reduce()`, as described in [Advanced R](http://adv-r.had.co.nz/Functionals.html#functionals-fp), to iteratively combine the two-table verbs to handle as many tables as you need.
+dplyr does not provide any functions for working with three or more tables. Instead use `purrr::reduce()` or `Reduce()`, as described in [Advanced R](http://adv-r.had.co.nz/Functionals.html#functionals-fp), to iteratively combine the two-table verbs to handle as many tables as you need.
diff --git a/vignettes/window-functions.Rmd b/vignettes/window-functions.Rmd
index 88dcc27..b89b9a0 100644
--- a/vignettes/window-functions.Rmd
+++ b/vignettes/window-functions.Rmd
@@ -1,29 +1,38 @@
---
-title: "Window functions and grouped mutate/filter"
-date: "`r Sys.Date()`"
+title: "Window functions"
output: rmarkdown::html_vignette
vignette: >
- %\VignetteIndexEntry{Window functions and grouped mutate/filter}
+ %\VignetteIndexEntry{Window functions}
%\VignetteEngine{knitr::rmarkdown}
%\usepackage[utf8]{inputenc}
---
-```{r, echo = FALSE, message = FALSE}
+```{r, include = FALSE}
knitr::opts_chunk$set(collapse = T, comment = "#>")
options(tibble.print_min = 4L, tibble.print_max = 4L)
library(dplyr)
+set.seed(1014)
```
A __window function__ is a variation on an aggregation function. Where an aggregation function, like `sum()` and `mean()`, takes n inputs and return a single value, a window function returns n values. The output of a window function depends on all its input values, so window functions don't include functions that work element-wise, like `+` or `round()`. Window functions include variations on aggregate functions, like `cumsum()` and `cummean()`, functions for ranking and ordering, like ` [...]
-Window functions are used in conjunction with `mutate` and `filter` to solve a wide range of problems, some of which are shown below:
+In this vignette, we'll use a small sample of the Lahman batting dataset, including the players that have won an award.
-```{r, results = "hide"}
+```{r}
library(Lahman)
-batting <- select(tbl_df(Batting), playerID, yearID, teamID, G, AB:H)
-batting <- arrange(batting, playerID, yearID, teamID)
-players <- group_by(batting, playerID)
+batting <- Lahman::Batting %>%
+ as_tibble() %>%
+ select(playerID, yearID, teamID, G, AB:H) %>%
+ arrange(playerID, yearID, teamID) %>%
+ semi_join(Lahman::AwardsPlayers, by = "playerID")
+
+players <- batting %>% group_by(playerID)
+```
+
+Window functions are used in conjunction with `mutate()` and `filter()` to solve a wide range of problems. Here's a selection:
+
+```{r, eval = FALSE}
# For each player, find the two years with most hits
filter(players, min_rank(desc(H)) <= 2 & H > 0)
# Within each player, rank each year by the number of games played
@@ -40,15 +49,13 @@ filter(players, G > mean(G))
mutate(players, G_z = (G - mean(G)) / sd(G))
```
-This vignette is broken down into two sections. First you'll learn about the five families of window functions in R, and what you can use them for. If you're only working with local data sources, you can stop there. Otherwise, continue on to learn about window functions in SQL. They are relatively new, but are supported by Postgres, Amazon's Redshift and Google's bigquery. The window functions themselves are basically the same (modulo a few name conflicts), but their specification is a l [...]
-
-Before reading this vignette, you should be familiar with `mutate()` and `filter()`. If you want to use window functions with SQL databases, you should also be familiar with the basics of dplyr's SQL translation.
+Before reading this vignette, you should be familiar with `mutate()` and `filter()`.
## Types of window functions
There are five main families of window functions. Two families are unrelated to aggregation functions:
-* Ranking and ordering functions: `row_number()`, `min_rank` (`RANK` in SQL),
+* Ranking and ordering functions: `row_number()`, `min_rank()`,
`dense_rank()`, `cume_dist()`, `percent_rank()`, and `ntile()`. These
functions all take a vector to order by, and return various types of ranks.
@@ -63,7 +70,7 @@ The other three families are variations on familiar aggregate functions:
* Rolling aggregates operate in a fixed width window. You won't find them in
base R or in dplyr, but there are many implementations in
other packages, such as
- [RcppRoll](http://cran.r-project.org/package=RcppRoll).
+ [RcppRoll](https://cran.r-project.org/package=RcppRoll).
* Recycled aggregates, where an aggregate is repeated to match the length
of the input. These are not needed in R because vector recycling
@@ -73,7 +80,7 @@ The other three families are variations on familiar aggregate functions:
Each family is described in more detail below, focussing on the general goals and how to use them with dplyr. For more details, refer to the individual function documentation.
-### Ranking functions
+## Ranking functions
The ranking functions are variations on a theme, differing in how they handle ties:
@@ -96,11 +103,7 @@ percent_rank(x)
These are useful if you want to select (for example) the top 10% of records within each group. For example:
-```{r, results = 'hide'}
-# Selects best two years
-filter(players, min_rank(desc(G)) < 2)
-
-# Selects best 10% of years
+```{r}
filter(players, cume_dist(desc(G)) < 0.1)
```
@@ -115,7 +118,7 @@ summarise(by_team_quartile, mean(G))
All ranking functions rank from lowest to highest so that small input values get small ranks. Use `desc()` to rank from highest to lowest.
-### Lead and lag
+## Lead and lag
`lead()` and `lag()` produce offset versions of a input vector that is either ahead of or behind the original vector.
@@ -159,13 +162,13 @@ right <- mutate(scrambled, running = order_by(year, cumsum(value)))
arrange(right, year)
```
-### Cumulative aggregates
+## Cumulative aggregates
Base R provides cumulative sum (`cumsum()`), cumulative min (`cummin()`) and cumulative max (`cummax()`). (It also provides `cumprod()` but that is rarely useful). Other common accumulating functions are `cumany()` and `cumall()`, cumulative versions of `||` and `&&`, and `cummean()`, a cumulative mean. These are not included in base R, but efficient versions are provided by `dplyr`.
`cumany()` and `cumall()` are useful for selecting all rows up to, or all rows after, a condition is true for the first (or last) time. For example, we can use `cumany()` to find all records for a player after they played a year with 150 games:
-```{r, results = "hide"}
+```{r, eval = FALSE}
filter(players, cumany(G > 150))
```
@@ -179,18 +182,18 @@ order_by(y, cumsum(x))
This function uses a bit of non-standard evaluation, so I wouldn't recommend using it inside another function; use the simpler but less concise `with_order()` instead.
-### Recycled aggregates
+## Recycled aggregates
R's vector recycling make it easy to select values that are higher or lower than a summary. I call this a recycled aggregate because the value of the aggregate is recycled to be the same length as the original vector. Recycled aggregates are useful if you want to find all records greater than the mean or less than the median:
-```{r, results = "hide"}
+```{r, eval = FALSE}
filter(players, G > mean(G))
filter(players, G < median(G))
```
While most SQL databases don't have an equivalent of `median()` or `quantile()`, when filtering you can achieve the same effect with `ntile()`. For example, `x > median(x)` is equivalent to `ntile(x, 2) == 2`; `x > quantile(x, 75)` is equivalent to `ntile(x, 100) > 75` or `ntile(x, 4) > 3`.
-```{r, results = "hide"}
+```{r, eval = FALSE}
filter(players, ntile(G, 2) == 2)
```
@@ -207,166 +210,3 @@ Or, as in the introductory example, we could compute a z-score:
```{r}
mutate(players, G_z = (G - mean(G)) / sd(G))
```
-
-## Window functions in SQL
-
-Window functions have a slightly different flavour in SQL. The syntax is a little different, and the cumulative, rolling and recycled aggregate functions are all based on the simple aggregate function. The goal in this section is not to tell you everything you need to know about window functions in SQL, but to remind you of the basics and show you how dplyr translates your R expressions into SQL.
-
-### Structure of a window function in SQL
-
-In SQL, window functions have the form `[expression] OVER ([partition clause] [order clause] [frame_clause])`:
-
-* The __expression__ is a combination of variable names and window functions.
- Support for window functions varies from database to database, but most
- support the ranking functions, `lead`, `lag`, `nth`, `first`,
- `last`, `count`, `min`, `max`, `sum`, `avg` and `stddev`. dplyr
- generates this from the R expression in your `mutate` or `filter` call.
-
-* The __partition clause__ specifies how the window function is broken down
- over groups. It plays an analogous role to `GROUP BY` for aggregate functions,
- and `group_by()` in dplyr. It is possible for different window functions to
- be partitioned into different groups, but not all databases support it, and
- neither does dplyr.
-
-* The __order clause__ controls the ordering (when it makes a difference).
- This is important for the ranking functions since it specifies which
- variables to rank by, but it's also needed for cumulative functions and lead.
- Whenever you're thinking about before and after in SQL, you must always tell
- it which variable defines the order. In dplyr you do this with `arrange()`.
- If the order clause is missing when needed, some databases fail with an
- error message while others return non-deterministic results.
-
-* The __frame clause__ defines which rows, or __frame__, are passed
- to the window function, describing which rows (relative to the current row)
- should be included. The frame clause provides two offsets which determine
- the start and end of frame. There are three special values: -Inf means
 to include all preceding rows (in SQL, "unbounded preceding"), 0 means the
- current row ("current row"), and Inf means all following rows ("unbounded
 following"). The complete set of options is comprehensive, but fairly
- confusing, and is summarised visually below.
-
- ![A visual summary of frame options](windows.png)
-
 Of the many possible specifications, there are only three that are commonly
- used. They select between aggregation variants:
-
 * Recycled: `BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING`
-
 * Cumulative: `BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW`
-
- * Rolling: `BETWEEN 2 PRECEEDING AND 2 FOLLOWING`
-
 dplyr generates the frame clause based on whether you're using a recycled
- aggregate or a cumulative aggregate.
-
-It's easiest to understand these specifications by looking at a few examples. Simple examples just need the partition and order clauses:
-
-* Rank each year within a player by number of home runs:
- `RANK() OVER (PARTITION BY playerID ORDER BY desc(H))`
-
-* Compute change in number of games from one year to the next:
 `G - LAG(G) OVER (PARTITION BY playerID ORDER BY yearID)`
-
-Aggregate variants are more verbose because we also need to supply the frame clause:
-
-* Running sum of G for each player: `SUM(G) OVER (PARTITION BY playerID ORDER BY yearID BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW)`
-
-* Compute the career year: `YearID - min(YearID) OVER (PARTITION BY playerID BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING) + 1`
-
-* Compute a rolling average of games played: `MEAN(G) OVER (PARTITION BY playerID ORDER BY yearID BETWEEN 2 PRECEDING AND 2 FOLLOWING)`
-
-You'll notice that window functions in SQL are more verbose than in R. This is because different window functions can have different partitions, and the frame specification is more general than the two aggregate variants (recycled and cumulative) provided by dplyr. dplyr makes a tradeoff: you can't access rarely used window function capabilities (unless you write raw SQL), but in return, common operations are much more succinct.
-
-### Translating dplyr to SQL
-
-To see how individual window functions are translated to SQL, we can use `translate_sql()` with the argument `window = TRUE`.
-
-```{r, message = FALSE}
-if (has_lahman("postgres")) {
- players_db <- group_by(tbl(lahman_postgres(), "Batting"), playerID)
-
- print(translate_sql(mean(G), tbl = players_db, window = TRUE))
- print(translate_sql(cummean(G), tbl = players_db, window = TRUE))
- print(translate_sql(rank(G), tbl = players_db, window = TRUE))
- print(translate_sql(ntile(G, 2), tbl = players_db, window = TRUE))
- print(translate_sql(lag(G), tbl = players_db, window = TRUE))
-}
-```
-
-If the tbl has been arranged previously, then that ordering will be used for the order clause:
-
-```{r, message = FALSE}
-if (has_lahman("postgres")) {
- players_by_year <- arrange(players_db, yearID)
- print(translate_sql(cummean(G), tbl = players_by_year, window = TRUE))
- print(translate_sql(rank(), tbl = players_by_year, window = TRUE))
- print(translate_sql(lag(G), tbl = players_by_year, window = TRUE))
-}
-```
-
-There are some challenges when translating window functions between R and SQL, because dplyr tries to keep the window functions as similar as possible to both the existing R analogues and to the SQL functions. This means that there are three ways to control the order clause depending on which window function you're using:
-
-* For ranking functions, the ordering variable is the first argument: `rank(x)`,
- `ntile(y, 2)`. If omitted or `NULL`, will use the default ordering associated
- with the tbl (as set by `arrange()`).
-
-* Accumulating aggregates only take a single argument (the vector to aggregate).
- To control ordering, use `order_by()`.
-
-* Aggregates implemented in dplyr (`lead`, `lag`, `nth_value`, `first_value`,
- `last_value`) have an `order_by` argument. Supply it to override the
- default ordering.
-
-The three options are illustrated in the snippet below:
-
-```{r, eval = FALSE}
-mutate(players,
- min_rank(yearID),
- order_by(yearID, cumsum(G)),
- lead(order_by = yearID, G)
-)
-```
-
-Currently there is no way to order by multiple variables, except by setting the default ordering with `arrange()`. This will be added in a future release.
-
-### Translating filters based on window functions
-
-There are some restrictions on window functions in SQL that make their use with `WHERE` somewhat challenging. Take this simple example, where we want to find the year each player played the most games:
-
-```{r, eval = FALSE}
-filter(players, rank(G) == 1)
-```
-
-The following straightforward translation does not work because window functions are only allowed in `SELECT` and `ORDER BY`.
-
-```
-SELECT *
-FROM Batting
-WHERE rank() OVER (PARTITION BY "playerID" ORDER BY "G") = 1;
-```
-
-Computing the window function in `SELECT` and referring to it in `WHERE` or `HAVING` doesn't work either, because `WHERE` and `HAVING` are computed before windowing functions.
-
-```
-SELECT *, rank() OVER (PARTITION BY "playerID" ORDER BY "G") as rank
-FROM Batting
-WHERE rank = 1;
-
-SELECT *, rank() OVER (PARTITION BY "playerID" ORDER BY "G") as rank
-FROM Batting
-HAVING rank = 1;
-```
-
-Instead, we must use a subquery:
-
-```
-SELECT *
-FROM (
- SELECT *, rank() OVER (PARTITION BY "playerID" ORDER BY "G") as rank
- FROM Batting
-) tmp
-WHERE rank = 1;
-```
-
-And even that query is a slight simplification because it will also add a rank column to the original columns. dplyr takes care of generating the full, verbose, query, so you can focus on your data analysis challenges.
-
diff --git a/vignettes/windows.graffle b/vignettes/windows.graffle
deleted file mode 100644
index d8181ae..0000000
Binary files a/vignettes/windows.graffle and /dev/null differ
diff --git a/vignettes/windows.png b/vignettes/windows.png
deleted file mode 100644
index eeae223..0000000
Binary files a/vignettes/windows.png and /dev/null differ
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-med/r-cran-dplyr.git
More information about the debian-med-commit
mailing list