From 3c5838dcbd4c695640384ac65da13da6afd66d43 Mon Sep 17 00:00:00 2001 From: hadley Date: Thu, 14 Jul 2016 10:57:54 -0500 Subject: [PATCH] data_frame -> tibble --- data-structures.Rmd | 4 ++-- hierarchy.Rmd | 2 +- iteration.Rmd | 8 ++++---- model-assess.Rmd | 9 +++++---- model-basics.Rmd | 8 ++++---- model-many.Rmd | 18 +++++++++--------- relational-data.Rmd | 16 ++++++++-------- robust-code.Rmd | 8 ++++---- transform.Rmd | 8 ++++---- 9 files changed, 41 insertions(+), 40 deletions(-) diff --git a/data-structures.Rmd b/data-structures.Rmd index 4912b03..ac4cafa 100644 --- a/data-structures.Rmd +++ b/data-structures.Rmd @@ -271,7 +271,7 @@ While vector recycling can be used to create very succinct, clever code, it can ```{r, error = TRUE} data.frame(x = 1:4, y = 1:2) -dplyr::data_frame(x = 1:4, y = 1:2) +tibble::tibble(x = 1:4, y = 1:2) purrr::map2(1:4, 1:2, `+`) ``` @@ -643,7 +643,7 @@ The difference between a data frame and a list is that all the elements of a dat In this book, we use tibbles, rather than data frames. Tibbles are identical to data frames, except that they have two additional components in the class: ```{r} -df2 <- dplyr::data_frame(x = 1:5, y = 5:1) +df2 <- tibble::tibble(x = 1:5, y = 5:1) typeof(df2) attributes(df2) ``` diff --git a/hierarchy.Rmd b/hierarchy.Rmd index ba02036..0a1ecc8 100644 --- a/hierarchy.Rmd +++ b/hierarchy.Rmd @@ -112,7 +112,7 @@ It's called transpose by analogy to matrices. When you subset a transposed matri Transpose is also useful when working with JSON APIs. Many JSON APIs represent data frames in a row-based format, rather than R's column-based format. `transpose()` makes it easy to switch between the two: ```{r} -df <- dplyr::data_frame(x = 1:3, y = c("a", "b", "c")) +df <- tibble::tibble(x = 1:3, y = c("a", "b", "c")) df %>% transpose() %>% str() ``` diff --git a/iteration.Rmd b/iteration.Rmd index 9cf7e8e..9256d14 100644 --- a/iteration.Rmd +++ b/iteration.Rmd @@ -774,7 +774,7 @@ knitr::include_graphics("diagrams/lists-pmap-named.png") Since the arguments are all the same length, it makes sense to store them in a data frame: ```{r} -params <- dplyr::data_frame(mean = mu, sd = sigma, n = n) +params <- tibble::tibble(mean = mu, sd = sigma, n = n) params$result <- params %>% pmap(rnorm) params ``` @@ -896,9 +896,9 @@ Sometimes you have a complex list that you want to reduce to a simple list by re ```{r} dfs <- list( - age = tibble::data_frame(name = "John", age = 30), - sex = tibble::data_frame(name = c("John", "Mary"), sex = c("M", "F")), - trt = tibble::data_frame(name = "Mary", treatment = "A") + age = tibble::tibble(name = "John", age = 30), + sex = tibble::tibble(name = c("John", "Mary"), sex = c("M", "F")), + trt = tibble::tibble(name = "Mary", treatment = "A") ) dfs %>% reduce(dplyr::full_join) diff --git a/model-assess.Rmd b/model-assess.Rmd index b8d5034..a82673a 100644 --- a/model-assess.Rmd +++ b/model-assess.Rmd @@ -2,6 +2,7 @@ ```{r setup-model, include=FALSE} library(purrr) +library(tibble) set.seed(1014) options(digits = 3) ``` @@ -118,7 +119,7 @@ true_model <- function(x) { 1 + 2 * x + rnorm(length(x), sd = 0.25) } -df <- data_frame( +df <- tibble( x = seq(0, 1, length = 20), y = true_model(x) ) @@ -161,7 +162,7 @@ fs <- list( y ~ poly(x, 7) ) -models <- data_frame( +models <- tibble( n = 1:7, f = fs, mod = map(f, lm, data = df), @@ -245,7 +246,7 @@ Both the boostrap and cross-validation are build on top of a "resample" object. 
These functions return an object of class "resample", which represents the resample in a memory-efficient way. Instead of storing the resampled dataset itself, it stores the integer indices and a "pointer" to the original dataset. This makes resamples take up much less memory.

```{r}
-x <- resample_bootstrap(as_data_frame(mtcars))
+x <- resample_bootstrap(as_tibble(mtcars))
class(x)
x

@@ -288,7 +289,7 @@ When you start dealing with many models, it's helpful to have some rough way of

One way to capture the quality of the model is to summarise the distribution of the residuals. For example, you could look at the quantiles of the absolute residuals. For this dataset, 25% of predictions are less than \$7,400 away, and 75% are less than \$25,800 away. That seems like quite a bit of error when predicting someone's income!

```{r}
-heights <- tibble::as_data_frame(readRDS("data/heights.RDS"))
+heights <- tibble::as_tibble(readRDS("data/heights.RDS"))
h <- lm(income ~ height, data = heights)
h
diff --git a/model-basics.Rmd b/model-basics.Rmd
index e0430b9..50b4bb2 100644
--- a/model-basics.Rmd
+++ b/model-basics.Rmd
@@ -69,7 +69,7 @@ options(
It can be easier to learn about modelling in a simulated environment where we know the truth. For example, imagine we have this data:

```{r}
-df <- data_frame(
+df <- tibble(
  x = rep(1:10, each = 3),
  y = 5 + x * 2 + rnorm(length(x), sd = 2)
)
@@ -93,7 +93,7 @@ Model class: linear. Model family: `y = a + b * x`. There are lots of possible models in that family. Here are a few:

```{r}
-models <- data_frame(
+models <- tibble(
  a = runif(250, -20, 80),
  b = runif(250, -5, 5)
)
@@ -606,7 +606,7 @@ heights_ed %>%
However: there's one major problem with using `poly()`: outside the range of the data, polynomials are going to rapidly shoot off to positive or negative infinity.

```{r}
-data_frame(education = seq(5, 25)) %>%
+tibble(education = seq(5, 25)) %>%
  gather_predictions(mod_e1, mod_e2, mod_e3) %>%
  ggplot(aes(education, pred, colour = model)) +
  geom_line()
@@ -620,7 +620,7 @@ mod_e1 <- lm(income ~ education, data = heights_ed)
mod_e2 <- lm(income ~ ns(education, 2), data = heights_ed)
mod_e3 <- lm(income ~ ns(education, 3), data = heights_ed)

-data_frame(education = seq(5, 25)) %>%
+tibble(education = seq(5, 25)) %>%
  gather_predictions(mod_e1, mod_e2, mod_e3) %>%
  ggplot(aes(education, pred, colour = model)) +
  geom_line()
diff --git a/model-many.Rmd b/model-many.Rmd
index f33b23e..3748bd4 100644
--- a/model-many.Rmd
+++ b/model-many.Rmd
@@ -290,10 +290,10 @@ data.frame(
)
```

-Tibble alleviates this problem by being lazier (`data_frame()` doesn't modify its inputs) and by providing a better print method:
+Tibble alleviates this problem by being lazier (`tibble()` doesn't modify its inputs) and by providing a better print method:

```{r}
-data_frame(
+tibble(
  x = list(1:3, 3:5),
  y = c("1, 2", "3, 4, 5")
)
@@ -316,7 +316,7 @@ Generally there are three parts of an effective list-column pipeline:

## Creating list-columns

-Typically, you won't create list-columns with `data_frame()`. Instead, you'll create them from regular columns, using one of three methods:
+Typically, you won't create list-columns with `tibble()`. Instead, you'll create them from regular columns, using one of three methods:

1. With `tidyr::nest()` to convert a grouped data frame into a nested data frame where you have a list-column of data frames.

@@ -353,7 +353,7 @@ gapminder %>%
Some useful functions take an atomic vector and return a list.
For example, in [strings] you learned about `stringr::str_split()`, which takes a character vector and returns a list of character vectors. If you use that inside `mutate()`, you'll get a list-column:

```{r}
-df <- data_frame(x1 = c("a,b,c", "d,e,f,g"))
+df <- tibble(x1 = c("a,b,c", "d,e,f,g"))

df %>%
  mutate(x2 = stringr::str_split(x1, ","))
@@ -480,7 +480,7 @@ These are described in more detail below.
If you can reduce your list-column to an atomic vector then it will be a regular column. For example, you can always summarise an object with its type and length, so this code will work regardless of what sort of list-column you have:

```{r}
-df <- data_frame(
+df <- tibble(
  x = list(
    letters[1:5],
    1:3,
@@ -499,7 +499,7 @@ This is the same basic information that you get from the default tbl print metho
Don't forget about the `map_*()` shortcuts - you can use `map_chr(x, "apple")` to extract the string stored in `apple` for each element of `x`. This is useful for pulling apart nested lists into regular columns. Use the `.null` argument to provide a value to use if the element is missing (instead of returning `NULL`):

```{r}
-df <- data_frame(
+df <- tibble(
  x = list(
    list(a = 1, b = 2),
    list(a = 2, c = 4)
@@ -516,7 +516,7 @@ df %>% mutate(
`unnest()` works by repeating the regular columns once for each element of the list-column. For example, in the following very simple example we repeat the first row 4 times (because the first element of `y` has length four), and the second row once:

```{r}
-data_frame(x = 1:2, y = list(1:4, 1)) %>% unnest(y)
+tibble(x = 1:2, y = list(1:4, 1)) %>% unnest(y)
```

This means that you can't simultaneously unnest two columns that contain a different number of elements:

```{r, error = TRUE}
# Ok, because y and z have the same number of elements in
# every row
-df1 <- data_frame(
+df1 <- tibble(
  x = 1:2,
  y = list(c("a", "b"), "c"),
  z = list(1:2, 3)
@@ -533,7 +533,7 @@ df1
df1 %>% unnest(y, z)

# Doesn't work because y and z have a different number of elements
-df2 <- data_frame(
+df2 <- tibble(
  x = 1:2,
  y = list("a", c("b", "c")),
  z = list(1:2, 3)
diff --git a/relational-data.Rmd b/relational-data.Rmd
index 3af796e..03388ff 100644
--- a/relational-data.Rmd
+++ b/relational-data.Rmd
@@ -181,8 +181,8 @@ To help you learn how joins work, I'm going to represent data frames visually:
knitr::include_graphics("diagrams/join-setup.png")
```
```{r}
-(x <- data_frame(key = c(1, 2, 3), val_x = c("x1", "x2", "x3")))
-(y <- data_frame(key = c(1, 2, 4), val_y = c("y1", "y2", "y3")))
+(x <- tibble(key = c(1, 2, 3), val_x = c("x1", "x2", "x3")))
+(y <- tibble(key = c(1, 2, 4), val_y = c("y1", "y2", "y3")))
```

The coloured column represents the "key" variable: these are used to match the rows between the tables. The grey column represents the "value" column that is carried along for the ride. In these examples I'll show a single key variable and a single value variable, but the idea generalises in a straightforward way to multiple keys and multiple values.
@@ -262,8 +262,8 @@ So far all the diagrams have assumed that the keys are unique. But that's not al
    and a foreign key in `x`.
```{r} - x <- data_frame(key = c(1, 2, 2, 1), val_x = str_c("x", 1:4)) - y <- data_frame(key = 1:2, val_y = str_c("y", 1:2)) + x <- tibble(key = c(1, 2, 2, 1), val_x = str_c("x", 1:4)) + y <- tibble(key = 1:2, val_y = str_c("y", 1:2)) left_join(x, y, by = "key") ``` @@ -276,8 +276,8 @@ So far all the diagrams have assumed that the keys are unique. But that's not al ``` ```{r} - x <- data_frame(key = c(1, 2, 2, 3), val_x = str_c("x", 1:4)) - y <- data_frame(key = c(1, 2, 2, 3), val_y = str_c("y", 1:4)) + x <- tibble(key = c(1, 2, 2, 3), val_x = str_c("x", 1:4)) + y <- tibble(key = c(1, 2, 2, 3), val_y = str_c("y", 1:4)) left_join(x, y, by = "key") ``` @@ -497,8 +497,8 @@ All these operations work with a complete row, comparing the values of every var Given this simple data: ```{r} -(df1 <- data_frame(x = 1:2, y = c(1L, 1L))) -(df2 <- data_frame(x = 1:2, y = 1:2)) +(df1 <- tibble(x = 1:2, y = c(1L, 1L))) +(df2 <- tibble(x = 1:2, y = 1:2)) ``` The four possibilities are: diff --git a/robust-code.Rmd b/robust-code.Rmd index 5a17c80..e01f9de 100644 --- a/robust-code.Rmd +++ b/robust-code.Rmd @@ -56,10 +56,10 @@ class(df[, "xy"]) df$x ``` -These features all made sense at the time they were added to R, but computing environments have changed a lot, and these features now tend to cause a lot of problems. dplyr disables them for you: +These features all made sense at the time they were added to R, but computing environments have changed a lot, and these features now tend to cause a lot of problems. tibble disables them for you: ```{r, error = TRUE} -df <- dplyr::data_frame(xy = c("x", "y")) +df <- tibble::tibble(xy = c("x", "y")) class(df$xy) class(df[, "xy"]) df$x @@ -164,7 +164,7 @@ There are two ways in which this function can fail: 1. `df$threshold` might exist: ```{r} - df <- dplyr::data_frame(x = 1:10, threshold = 100) + df <- tibble::tibble(x = 1:10, threshold = 100) big_x(df, 5) ``` @@ -212,7 +212,7 @@ Functions are easiest to reason about if they have two properties: The first property is particularly important. If a function has hidden additional inputs, it's very difficult to even know where the important context is! -The biggest breakers of this rule in base R are functions that create data frames. Most of these functions have a `stringsAsFactors` argument that defaults to `getOption("stringsAsFactors")`. This means that a global option affects the operation of a very large number of functions, and you need to be aware that, depending on an external state, a function might produce either a character vector or a factor. In this book, we steer you away from that problem by recommending functions like `readr::read_csv()` and `dplyr::data_frame()` that don't rely on this option. But be aware of it! Generally if a function is affected by a global option, you should avoid setting it. +The biggest breakers of this rule in base R are functions that create data frames. Most of these functions have a `stringsAsFactors` argument that defaults to `getOption("stringsAsFactors")`. This means that a global option affects the operation of a very large number of functions, and you need to be aware that, depending on an external state, a function might produce either a character vector or a factor. In this book, we steer you away from that problem by recommending functions like `readr::read_csv()` and `tibble::tibble()` that don't rely on this option. But be aware of it! Generally if a function is affected by a global option, you should avoid setting it. 
Only use `options()` to control side-effects of a function. The value of an option should never affect the return value of a function. There are only three violations of this rule in base R: `stringsAsFactors`, `encoding`, `na.action`. For example, base R lets you control the number of digits printed in default displays with (e.g.) `options(digits = 3)`. This is a good use of an option because it's something that people frequently want control over, but doesn't affect the computation of a result, just its display. Follow this principle with your own use of options. diff --git a/transform.Rmd b/transform.Rmd index ac285a5..0cf7a1d 100644 --- a/transform.Rmd +++ b/transform.Rmd @@ -172,7 +172,7 @@ Note that R has both `&` and `|` and `&&` and `||`. `&` and `|` are vectorised: Sometimes you want to find all rows after the first `TRUE`, or all rows until the first `FALSE`. The window functions `cumany()` and `cumall()` allow you to find these values: ```{r} -df <- data_frame( +df <- tibble( x = c(FALSE, TRUE, FALSE), y = c(TRUE, FALSE, TRUE) ) @@ -219,7 +219,7 @@ If you want to determine if a value is missing, use `is.na()`. (This is such a c `filter()` only includes rows where the condition is `TRUE`; it excludes both `FALSE` and `NA` values. If you want to preserve missing values, ask for them explicitly: ```{r} -df <- data_frame(x = c(1, NA, 3)) +df <- tibble(x = c(1, NA, 3)) filter(df, x > 1) filter(df, is.na(x) | x > 1) ``` @@ -260,7 +260,7 @@ arrange(flights, desc(arr_delay)) Missing values are always sorted at the end: ```{r} -df <- data_frame(x = c(5, 2, NA)) +df <- tibble(x = c(5, 2, NA)) arrange(df, x) arrange(df, desc(x)) ``` @@ -484,7 +484,7 @@ There are many functions for creating new variables that you can use with `mutat ```{r} y <- c(1, 2, 2, NA, 3, 4) - data_frame( + tibble( row_number(y), min_rank(y), dense_rank(y),