From daaa861f74e9c8171f8a5937797433832e98940b Mon Sep 17 00:00:00 2001 From: hadley Date: Fri, 7 Oct 2016 08:16:09 -0500 Subject: [PATCH] Tibbles to tribbles @jennybc --- import.Rmd | 8 ++++--- model-basics.Rmd | 7 +++++- model-many.Rmd | 52 +++++++++++++++++++++++++---------------- relational-data.Rmd | 56 ++++++++++++++++++++++++++++++++++++++------- visualize.Rmd | 9 ++++---- 5 files changed, 96 insertions(+), 36 deletions(-) diff --git a/import.Rmd b/import.Rmd index d845074..840521e 100644 --- a/import.Rmd +++ b/import.Rmd @@ -548,9 +548,11 @@ There are a few other general strategies to help you parse files: frame. ```{r} - df <- tibble( - x = c("1", "2", "3"), - y = c("1.21", "2.32", "4.56") + df <- tribble( + ~x, ~y, + "1", "1.21", + "2", "2.32", + "3", "4.56" ) df diff --git a/model-basics.Rmd b/model-basics.Rmd index 3dc09c6..524acb2 100644 --- a/model-basics.Rmd +++ b/model-basics.Rmd @@ -542,7 +542,12 @@ You can also perform transformations inside the model formula. For example, `log Again, if you get confused about what your model is doing, you can always use `model_matrix()` to see exactly what equation `lm()` is fitting: ```{r} -df <- tibble(y = 1:3, x = 1:3) +df <- tribble( + ~y, ~x, + 1, 1, + 2, 2, + 3, 3 +) model_matrix(df, y ~ x^2 + x) model_matrix(df, y ~ I(x^2) + x) ``` diff --git a/model-many.Rmd b/model-many.Rmd index e713c28..a71074c 100644 --- a/model-many.Rmd +++ b/model-many.Rmd @@ -295,6 +295,16 @@ tibble( ) ``` +It's even easier with `tribble()` as it can automatically work out that you need a list: + +```{r} +tribble( + ~x, ~y, + 1:3, "1, 2", + 3:5, "3, 4, 5" +) +``` + List-columns are often most useful as intermediate data structure. They're hard to work with directly, because most R functions work with atomic vectors or data frames, but the advantage of keeping related items together in a data frame is worth a little hassle. Generally there are three parts of an effective list-column pipeline: @@ -349,7 +359,11 @@ gapminder %>% Some useful functions take an atomic vector and return a list. For example, in [strings] you learned about `stringr::str_split()` which takes a character vector and returns a list of character vectors. If you use that inside mutate, you'll get a list-column: ```{r} -df <- tibble(x1 = c("a,b,c", "d,e,f,g")) +df <- tribble( + ~x1, + "a,b,c", + "d,e,f,g" +) df %>% mutate(x2 = stringr::str_split(x1, ",")) @@ -478,12 +492,11 @@ These are described in more detail below. If you can reduce your list column to an atomic vector then it will be a regular column. For example, you can always summarise an object with it's type and length, so this code will work regardless of what sort of list-column you have: ```{r} -df <- tibble( - x = list( - letters[1:5], - 1:3, - runif(5) - ) +df <- tribble( + ~x, + letters[1:5], + 1:3, + runif(5) ) df %>% mutate( @@ -497,11 +510,10 @@ This is the same basic information that you get from the default tbl print metho Don't forget about the `map_*()` shortcuts - you can use `map_chr(x, "apple")` to extract the string stored in `apple` for each element of `x`. This is useful for pulling apart nested lists into regular columns. Use the `.null` argument to provide a value to use if the element is missing (instead of returning `NULL`): ```{r} -df <- tibble( - x = list( - list(a = 1, b = 2), - list(a = 2, c = 4) - ) +df <- tribble( + ~x, + list(a = 1, b = 2), + list(a = 2, c = 4) ) df %>% mutate( a = map_dbl(x, "a"), @@ -522,19 +534,19 @@ This means that you can't simultaneously unnest two columns that contain differe ```{r, error = TRUE} # Ok, because y and z have the same number of elements in # every row -df1 <- tibble( - x = 1:2, - y = list(c("a", "b"), "c"), - z = list(1:2, 3) +df1 <- tribble( + ~x, ~y, ~z, + 1, c("a", "b"), 1:2, + 2, "c", 3 ) df1 df1 %>% unnest(y, z) # Doesn't work because y and z have different number of elements -df2 <- tibble( - x = 1:2, - y = list("a", c("b", "c")), - z = list(1:2, 3) +df2 <- tribble( + ~x, ~y, ~z, + 1, "a", 1:2, + 2, c("b", "c"), 3 ) df2 df2 %>% unnest(y, z) diff --git a/relational-data.Rmd b/relational-data.Rmd index 2a021a9..9760bd7 100644 --- a/relational-data.Rmd +++ b/relational-data.Rmd @@ -206,8 +206,18 @@ To help you learn how joins work, I'm going to use a visual representation: knitr::include_graphics("diagrams/join-setup.png") ``` ```{r} -(x <- tibble(key = c(1, 2, 3), val_x = c("x1", "x2", "x3"))) -(y <- tibble(key = c(1, 2, 4), val_y = c("y1", "y2", "y3"))) +x <- tribble( + ~key, ~val_x, + 1, "x1", + 2, "x2", + 3, "x3" +) +y <- tribble( + ~key, ~val_y, + 1, "y1", + 2, "y2", + 4, "y3" +) ``` The coloured column represents the "key" variable: these are used to match the rows between the tables. The grey column represents the "value" column that is carried along for the ride. In these examples I'll show a single key variable and single value variable, but idea generalises in a straightforward way to multiple keys and multiple values. @@ -288,8 +298,18 @@ So far all the diagrams have assumed that the keys are unique. But that's not al and a foreign key in `x`. ```{r} - x <- tibble(key = c(1, 2, 2, 1), val_x = stringr::str_c("x", 1:4)) - y <- tibble(key = 1:2, val_y = stringr::str_c("y", 1:2)) + x <- tribble( + ~key, ~val_x, + 1, "x1", + 2, "x2", + 2, "x3", + 1, "x4" + ) + y <- tribble( + ~key, ~val_y, + 1, "y1", + 2, "y2" + ) left_join(x, y, by = "key") ``` @@ -302,8 +322,20 @@ So far all the diagrams have assumed that the keys are unique. But that's not al ``` ```{r} - x <- tibble(key = c(1, 2, 2, 3), val_x = stringr::str_c("x", 1:4)) - y <- tibble(key = c(1, 2, 2, 3), val_y = stringr::str_c("y", 1:4)) + x <- tribble( + ~key, ~val_x, + 1, "x1", + 2, "x2", + 2, "x3", + 3, "x4" + ) + y <- tribble( + ~key, ~val_y, + 1, "y1", + 2, "y2", + 2, "y3", + 3, "y4" + ) left_join(x, y, by = "key") ``` @@ -543,8 +575,16 @@ The final type of two-table verb are the set operations. Generally, I use these Given this simple data: ```{r} -(df1 <- tibble(x = 1:2, y = c(1L, 1L))) -(df2 <- tibble(x = 1:2, y = 1:2)) +df1 <- tribble( + ~x, ~y, + 1, 1, + 2, 1 +) +df2 <- tribble( + ~x, ~y, + 1, 1, + 1, 2 +) ``` The four possibilities are: diff --git a/visualize.Rmd b/visualize.Rmd index 1dfa6e7..5b3e98f 100644 --- a/visualize.Rmd +++ b/visualize.Rmd @@ -499,11 +499,12 @@ This works because every geom has a default stat; and every stat has a default g is generated by counting rows. ```{r, warning = FALSE} - demo <- tibble( - a = c("bar_1", "bar_2", "bar_3"), - b = c(20, 30, 40) + demo <- tribble( + ~a, ~b, + "bar_1", 20, + "bar_2", 30, + "bar_3", 40 ) - demo ggplot(data = demo) + geom_bar(mapping = aes(x = a, y = b), stat = "identity")