From cd6c68b5a9ae27001d68d1d6a9e457a19221d514 Mon Sep 17 00:00:00 2001 From: Hadley Wickham Date: Tue, 7 Feb 2023 15:48:02 -0600 Subject: [PATCH] Minimise iteration --- databases.qmd | 1 - iteration.qmd | 27 ++++----------------------- 2 files changed, 4 insertions(+), 24 deletions(-) diff --git a/databases.qmd b/databases.qmd index fd475b8..5084a08 100644 --- a/databases.qmd +++ b/databases.qmd @@ -132,7 +132,6 @@ dbWriteTable(con, "diamonds", ggplot2::diamonds) If you're using duckdb in a real project, we highly recommend learning about `duckdb_read_csv()` and `duckdb_register_arrow()`. These give you powerful and performant ways to quickly load data directly into duckdb, without having to first load it into R. - We'll also show off a useful technique for loading multiple files into a database in @sec-save-database. ## DBI basics diff --git a/iteration.qmd b/iteration.qmd index 8bf303f..de3d086 100644 --- a/iteration.qmd +++ b/iteration.qmd @@ -108,21 +108,6 @@ Note grouping columns (`grp` here) are not included in `across()`, because they' - `where(is.POSIXct)` selects all date-time columns. - `where(is.logical)` selects all logical columns. -```{r} -df_types <- tibble( - x1 = 1:3, - x2 = runif(3), - y1 = sample(letters, 3), - y2 = c("banana", "apple", "egg") -) - -df_types |> - summarize(across(where(is.numeric), mean)) - -df_types |> - summarize(across(where(is.character), str_flatten)) -``` - Just like other selectors, you can combine these with Boolean algebra. For example, `!where(is.numeric)` selects all non-numeric columns, and `starts_with("a") & where(is.logical)` selects all logical columns whose name starts with "a". @@ -288,12 +273,10 @@ It's clear that `across()` can help to create multiple logical columns, but then So dplyr provides two variants of `across()` called `if_any()` and `if_all()`: ```{r} -df_miss |> filter(is.na(a) | is.na(b) | is.na(c) | is.na(d)) -# same as: +# same as df_miss |> filter(is.na(a) | is.na(b) | is.na(c) | is.na(d)) df_miss |> filter(if_any(a:d, is.na)) -df_miss |> filter(is.na(a) & is.na(b) & is.na(c) & is.na(d)) -# same as: +# same as df_miss |> filter(is.na(a) & is.na(b) & is.na(c) & is.na(d)) df_miss |> filter(if_all(a:d, is.na)) ``` @@ -332,11 +315,11 @@ summarize_means <- function(df, summary_vars = where(is.numeric)) { ) } diamonds |> - group_by(clarity) |> + group_by(cut) |> summarize_means() diamonds |> - group_by(clarity) |> + group_by(cut) |> summarize_means(c(carat, x:z)) ``` @@ -650,7 +633,6 @@ In more complicated cases, there might be other variables stored in the director In that case, use `set_names()` (without any arguments) to record the full path, and then use `tidyr::separate_wider_delim()` and friends to turn them into useful columns. ```{r} -# NOTE: this chapter also depends on dev tidyr (in addition to dev purrr and dev dplyr) paths |> set_names() |> map(readxl::read_excel) |> @@ -763,7 +745,6 @@ df_types <- function(df) { } df_types(starwars) -df_types(nycflights13::flights) ``` You can then apply this function to all of the files, and maybe do some pivoting to make it easier to see where the differences are.