Data Transform

《区域水环境污染数据分析实践》
Data analysis practice of regional water environment pollution

苏命、王为东
中国科学院大学资源与环境学院
中国科学院生态环境研究中心

2024-04-09

计数

flights |>
  count(origin, dest, sort = TRUE)
# A tibble: 224 × 3
   origin dest      n
   <chr>  <chr> <int>
 1 JFK    LAX   11262
 2 LGA    ATL   10263
 3 LGA    ORD    8857
 4 JFK    SFO    8204
 5 LGA    CLT    6168
 6 EWR    ORD    6100
 7 JFK    BOS    5898
 8 LGA    MIA    5781
 9 JFK    MCO    5464
10 EWR    BOS    5327
# ℹ 214 more rows

计数-练习

统计每月的航班数量。

# A tibble: 12 × 3
    year month     n
   <int> <int> <int>
 1  2013     7 29425
 2  2013     8 29327
 3  2013    10 28889
 4  2013     3 28834
 5  2013     5 28796
 6  2013     4 28330
 7  2013     6 28243
 8  2013    12 28135
 9  2013     9 27574
10  2013    11 27268
11  2013     1 27004
12  2013     2 24951

计算新变量

flights |> 
  mutate(
    gain = dep_delay - arr_delay,
    speed = distance / air_time * 60
  )
# A tibble: 336,776 × 21
    year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
   <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
 1  2013     1     1      517            515         2      830            819
 2  2013     1     1      533            529         4      850            830
 3  2013     1     1      542            540         2      923            850
 4  2013     1     1      544            545        -1     1004           1022
 5  2013     1     1      554            600        -6      812            837
 6  2013     1     1      554            558        -4      740            728
 7  2013     1     1      555            600        -5      913            854
 8  2013     1     1      557            600        -3      709            723
 9  2013     1     1      557            600        -3      838            846
10  2013     1     1      558            600        -2      753            745
# ℹ 336,766 more rows
# ℹ 13 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
#   tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
#   hour <dbl>, minute <dbl>, time_hour <dttm>, gain <dbl>, speed <dbl>

计算新变量

flights |> 
  mutate(
    gain = dep_delay - arr_delay,
    speed = distance / air_time * 60,
    .before = 1
  )
# A tibble: 336,776 × 21
    gain speed  year month   day dep_time sched_dep_time dep_delay arr_time
   <dbl> <dbl> <int> <int> <int>    <int>          <int>     <dbl>    <int>
 1    -9  370.  2013     1     1      517            515         2      830
 2   -16  374.  2013     1     1      533            529         4      850
 3   -31  408.  2013     1     1      542            540         2      923
 4    17  517.  2013     1     1      544            545        -1     1004
 5    19  394.  2013     1     1      554            600        -6      812
 6   -16  288.  2013     1     1      554            558        -4      740
 7   -24  404.  2013     1     1      555            600        -5      913
 8    11  259.  2013     1     1      557            600        -3      709
 9     5  405.  2013     1     1      557            600        -3      838
10   -10  319.  2013     1     1      558            600        -2      753
# ℹ 336,766 more rows
# ℹ 12 more variables: sched_arr_time <int>, arr_delay <dbl>, carrier <chr>,
#   flight <int>, tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>,
#   distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>

计算新变量

flights |> 
  mutate(
    gain = dep_delay - arr_delay,
    speed = distance / air_time * 60,
    .after = day
  )
# A tibble: 336,776 × 21
    year month   day  gain speed dep_time sched_dep_time dep_delay arr_time
   <int> <int> <int> <dbl> <dbl>    <int>          <int>     <dbl>    <int>
 1  2013     1     1    -9  370.      517            515         2      830
 2  2013     1     1   -16  374.      533            529         4      850
 3  2013     1     1   -31  408.      542            540         2      923
 4  2013     1     1    17  517.      544            545        -1     1004
 5  2013     1     1    19  394.      554            600        -6      812
 6  2013     1     1   -16  288.      554            558        -4      740
 7  2013     1     1   -24  404.      555            600        -5      913
 8  2013     1     1    11  259.      557            600        -3      709
 9  2013     1     1     5  405.      557            600        -3      838
10  2013     1     1   -10  319.      558            600        -2      753
# ℹ 336,766 more rows
# ℹ 12 more variables: sched_arr_time <int>, arr_delay <dbl>, carrier <chr>,
#   flight <int>, tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>,
#   distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>

计算新变量

flights |> 
  mutate(
    gain = dep_delay - arr_delay,
    hours = air_time / 60,
    gain_per_hour = gain / hours,
    .keep = "used"
  )
# A tibble: 336,776 × 6
   dep_delay arr_delay air_time  gain hours gain_per_hour
       <dbl>     <dbl>    <dbl> <dbl> <dbl>         <dbl>
 1         2        11      227    -9 3.78          -2.38
 2         4        20      227   -16 3.78          -4.23
 3         2        33      160   -31 2.67         -11.6 
 4        -1       -18      183    17 3.05           5.57
 5        -6       -25      116    19 1.93           9.83
 6        -4        12      150   -16 2.5           -6.4 
 7        -5        19      158   -24 2.63          -9.11
 8        -3       -14       53    11 0.883         12.5 
 9        -3        -8      140     5 2.33           2.14
10        -2         8      138   -10 2.3           -4.35
# ℹ 336,766 more rows

列排序

flights |> 
  relocate(time_hour, air_time)
# A tibble: 336,776 × 19
   time_hour           air_time  year month   day dep_time sched_dep_time
   <dttm>                 <dbl> <int> <int> <int>    <int>          <int>
 1 2013-01-01 05:00:00      227  2013     1     1      517            515
 2 2013-01-01 05:00:00      227  2013     1     1      533            529
 3 2013-01-01 05:00:00      160  2013     1     1      542            540
 4 2013-01-01 05:00:00      183  2013     1     1      544            545
 5 2013-01-01 06:00:00      116  2013     1     1      554            600
 6 2013-01-01 05:00:00      150  2013     1     1      554            558
 7 2013-01-01 06:00:00      158  2013     1     1      555            600
 8 2013-01-01 06:00:00       53  2013     1     1      557            600
 9 2013-01-01 06:00:00      140  2013     1     1      557            600
10 2013-01-01 06:00:00      138  2013     1     1      558            600
# ℹ 336,766 more rows
# ℹ 12 more variables: dep_delay <dbl>, arr_time <int>, sched_arr_time <int>,
#   arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>, origin <chr>,
#   dest <chr>, distance <dbl>, hour <dbl>, minute <dbl>

列排序

flights |> 
  relocate(year:dep_time, .after = time_hour)
flights |> 
  relocate(starts_with("arr"), .before = dep_time)
flights |> 
  select(starts_with("arr"), everything())

练习

计算目的地为IAH,按飞行速度排序的表格,保留year:day, dep_time, carrier, flight与speed列。

flights |> 
  filter(dest == "IAH") |> 
  mutate(speed = distance / air_time * 60) |> 
  select(year:day, dep_time, carrier, flight, speed) |> 
  arrange(desc(speed))
# A tibble: 7,198 × 7
    year month   day dep_time carrier flight speed
   <int> <int> <int>    <int> <chr>    <int> <dbl>
 1  2013     7     9      707 UA         226  522.
 2  2013     8    27     1850 UA        1128  521.
 3  2013     8    28      902 UA        1711  519.
 4  2013     8    28     2122 UA        1022  519.
 5  2013     6    11     1628 UA        1178  515.
 6  2013     8    27     1017 UA         333  515.
 7  2013     8    27     1205 UA        1421  515.
 8  2013     8    27     1758 UA         302  515.
 9  2013     9    27      521 UA         252  515.
10  2013     8    28      625 UA         559  515.
# ℹ 7,188 more rows

练习

计算目的地为IAH,按飞行速度排序的表格,保留year:day, dep_time, carrier, flight与speed列。

flights1 <- filter(flights, dest == "IAH")
flights2 <- mutate(flights1, speed = distance / air_time * 60)
flights3 <- select(flights2, year:day, dep_time, carrier, flight, speed)
arrange(flights3, desc(speed))

练习

计算目的地为IAH,按飞行速度排序的表格,保留year:day, dep_time, carrier, flight与speed列。

flights |> 
  filter(dest == "IAH") |> 
  mutate(speed = distance / air_time * 60) |> 
  select(year:day, dep_time, carrier, flight, speed) |> 
  arrange(desc(speed))
# A tibble: 7,198 × 7
    year month   day dep_time carrier flight speed
   <int> <int> <int>    <int> <chr>    <int> <dbl>
 1  2013     7     9      707 UA         226  522.
 2  2013     8    27     1850 UA        1128  521.
 3  2013     8    28      902 UA        1711  519.
 4  2013     8    28     2122 UA        1022  519.
 5  2013     6    11     1628 UA        1178  515.
 6  2013     8    27     1017 UA         333  515.
 7  2013     8    27     1205 UA        1421  515.
 8  2013     8    27     1758 UA         302  515.
 9  2013     9    27      521 UA         252  515.
10  2013     8    28      625 UA         559  515.
# ℹ 7,188 more rows

分组统计

library(tidyverse)

mtcars %>% 
  group_by(cyl) %>%
  summarize(n = n())
# A tibble: 3 × 2
    cyl     n
  <dbl> <int>
1     4    11
2     6     7
3     8    14

分组统计

flights |> 
  group_by(month)
# A tibble: 336,776 × 19
# Groups:   month [12]
    year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
   <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
 1  2013     1     1      517            515         2      830            819
 2  2013     1     1      533            529         4      850            830
 3  2013     1     1      542            540         2      923            850
 4  2013     1     1      544            545        -1     1004           1022
 5  2013     1     1      554            600        -6      812            837
 6  2013     1     1      554            558        -4      740            728
 7  2013     1     1      555            600        -5      913            854
 8  2013     1     1      557            600        -3      709            723
 9  2013     1     1      557            600        -3      838            846
10  2013     1     1      558            600        -2      753            745
# ℹ 336,766 more rows
# ℹ 11 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
#   tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
#   hour <dbl>, minute <dbl>, time_hour <dttm>

分组统计

flights |> 
  group_by(month) |> 
  summarize(
    avg_delay = mean(dep_delay)
  )
# A tibble: 12 × 2
   month avg_delay
   <int>     <dbl>
 1     1        NA
 2     2        NA
 3     3        NA
 4     4        NA
 5     5        NA
 6     6        NA
 7     7        NA
 8     8        NA
 9     9        NA
10    10        NA
11    11        NA
12    12        NA

分组统计

flights |> 
  group_by(month) |> 
  summarize(
    avg_delay = mean(dep_delay, na.rm = TRUE)
  )
# A tibble: 12 × 2
   month avg_delay
   <int>     <dbl>
 1     1     10.0 
 2     2     10.8 
 3     3     13.2 
 4     4     13.9 
 5     5     13.0 
 6     6     20.8 
 7     7     21.7 
 8     8     12.6 
 9     9      6.72
10    10      6.24
11    11      5.44
12    12     16.6 

分组统计

flights |> 
  group_by(month) |> 
  summarize(
    avg_delay = mean(dep_delay, na.rm = TRUE), 
    n = n()
  )
# A tibble: 12 × 3
   month avg_delay     n
   <int>     <dbl> <int>
 1     1     10.0  27004
 2     2     10.8  24951
 3     3     13.2  28834
 4     4     13.9  28330
 5     5     13.0  28796
 6     6     20.8  28243
 7     7     21.7  29425
 8     8     12.6  29327
 9     9      6.72 27574
10    10      6.24 28889
11    11      5.44 27268
12    12     16.6  28135

分组统计

flights |> 
  group_by(dest) |> 
  slice_max(arr_delay, n = 1) |>
  relocate(dest)
# A tibble: 108 × 19
# Groups:   dest [105]
   dest   year month   day dep_time sched_dep_time dep_delay arr_time
   <chr> <int> <int> <int>    <int>          <int>     <dbl>    <int>
 1 ABQ    2013     7    22     2145           2007        98      132
 2 ACK    2013     7    23     1139            800       219     1250
 3 ALB    2013     1    25      123           2000       323      229
 4 ANC    2013     8    17     1740           1625        75     2042
 5 ATL    2013     7    22     2257            759       898      121
 6 AUS    2013     7    10     2056           1505       351     2347
 7 AVL    2013     8    13     1156            832       204     1417
 8 BDL    2013     2    21     1728           1316       252     1839
 9 BGR    2013    12     1     1504           1056       248     1628
10 BHM    2013     4    10       25           1900       325      136
# ℹ 98 more rows
# ℹ 11 more variables: sched_arr_time <int>, arr_delay <dbl>, carrier <chr>,
#   flight <int>, tailnum <chr>, origin <chr>, air_time <dbl>, distance <dbl>,
#   hour <dbl>, minute <dbl>, time_hour <dttm>

分组统计

flights |>
  filter(dest == "IAH") |> 
  group_by(year, month, day) |> 
  summarize(
    arr_delay = mean(arr_delay, na.rm = TRUE)
  )
# A tibble: 365 × 4
# Groups:   year, month [12]
    year month   day arr_delay
   <int> <int> <int>     <dbl>
 1  2013     1     1     17.8 
 2  2013     1     2      7   
 3  2013     1     3     18.3 
 4  2013     1     4     -3.2 
 5  2013     1     5     20.2 
 6  2013     1     6      9.28
 7  2013     1     7     -7.74
 8  2013     1     8      7.79
 9  2013     1     9     18.1 
10  2013     1    10      6.68
# ℹ 355 more rows

分组

daily <- flights |>  
  group_by(year, month, day)
daily
# A tibble: 336,776 × 19
# Groups:   year, month, day [365]
    year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
   <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
 1  2013     1     1      517            515         2      830            819
 2  2013     1     1      533            529         4      850            830
 3  2013     1     1      542            540         2      923            850
 4  2013     1     1      544            545        -1     1004           1022
 5  2013     1     1      554            600        -6      812            837
 6  2013     1     1      554            558        -4      740            728
 7  2013     1     1      555            600        -5      913            854
 8  2013     1     1      557            600        -3      709            723
 9  2013     1     1      557            600        -3      838            846
10  2013     1     1      558            600        -2      753            745
# ℹ 336,766 more rows
# ℹ 11 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
#   tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
#   hour <dbl>, minute <dbl>, time_hour <dttm>

分组统计

daily_flights <- daily |> 
  summarize(n = n())

分组统计

daily_flights <- daily |> 
  summarize(
    n = n(), 
    .groups = "drop_last"
  )

删除分组

daily |> ungroup()
# A tibble: 336,776 × 19
    year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
   <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
 1  2013     1     1      517            515         2      830            819
 2  2013     1     1      533            529         4      850            830
 3  2013     1     1      542            540         2      923            850
 4  2013     1     1      544            545        -1     1004           1022
 5  2013     1     1      554            600        -6      812            837
 6  2013     1     1      554            558        -4      740            728
 7  2013     1     1      555            600        -5      913            854
 8  2013     1     1      557            600        -3      709            723
 9  2013     1     1      557            600        -3      838            846
10  2013     1     1      558            600        -2      753            745
# ℹ 336,766 more rows
# ℹ 11 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
#   tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
#   hour <dbl>, minute <dbl>, time_hour <dttm>

删除分组

daily |> 
  ungroup() |>
  summarize(
    avg_delay = mean(dep_delay, na.rm = TRUE), 
    flights = n()
  )
# A tibble: 1 × 2
  avg_delay flights
      <dbl>   <int>
1      12.6  336776

分组统计

flights |> 
  summarize(
    delay = mean(dep_delay, na.rm = TRUE), 
    n = n(),
    .by = month
  )
# A tibble: 12 × 3
   month delay     n
   <int> <dbl> <int>
 1     1 10.0  27004
 2    10  6.24 28889
 3    11  5.44 27268
 4    12 16.6  28135
 5     2 10.8  24951
 6     3 13.2  28834
 7     4 13.9  28330
 8     5 13.0  28796
 9     6 20.8  28243
10     7 21.7  29425
11     8 12.6  29327
12     9  6.72 27574

分组统计

flights |> 
  summarize(
    delay = mean(dep_delay, na.rm = TRUE), 
    n = n(),
    .by = c(origin, dest)
  )
# A tibble: 224 × 4
   origin dest  delay     n
   <chr>  <chr> <dbl> <int>
 1 EWR    IAH   11.8   3973
 2 LGA    IAH    9.06  2951
 3 JFK    MIA    9.34  3314
 4 JFK    BQN    6.67   599
 5 LGA    ATL   11.4  10263
 6 EWR    ORD   14.6   6100
 7 EWR    FLL   13.5   3793
 8 LGA    IAD   16.7   1803
 9 JFK    MCO   10.6   5464
10 LGA    ORD   10.7   8857
# ℹ 214 more rows

练习

df <- tibble(
  x = 1:5,
  y = c("a", "b", "a", "a", "b"),
  z = c("K", "K", "L", "L", "K")
)
df
# A tibble: 5 × 3
      x y     z    
  <int> <chr> <chr>
1     1 a     K    
2     2 b     K    
3     3 a     L    
4     4 a     L    
5     5 b     K    
df |> arrange(y)

练习

# A tibble: 5 × 3
      x y     z    
  <int> <chr> <chr>
1     1 a     K    
2     2 b     K    
3     3 a     L    
4     4 a     L    
5     5 b     K    
df |>
  group_by(y) |>
  summarize(mean_x = mean(x))

练习

# A tibble: 5 × 3
      x y     z    
  <int> <chr> <chr>
1     1 a     K    
2     2 b     K    
3     3 a     L    
4     4 a     L    
5     5 b     K    
df |>
  group_by(y, z) |>
  summarize(mean_x = mean(x))

练习

# A tibble: 5 × 3
      x y     z    
  <int> <chr> <chr>
1     1 a     K    
2     2 b     K    
3     3 a     L    
4     4 a     L    
5     5 b     K    
df |>
  group_by(y, z) |>
  summarize(mean_x = mean(x), .groups = "drop")

练习

# A tibble: 5 × 3
      x y     z    
  <int> <chr> <chr>
1     1 a     K    
2     2 b     K    
3     3 a     L    
4     4 a     L    
5     5 b     K    
df |>
  group_by(y, z) |>
  summarize(mean_x = mean(x))

df |>
  group_by(y, z) |>
  mutate(mean_x = mean(x))

练习

  • 计算不同采样点的平均CO浓度、最大CO浓度、最小CO浓度、中位数CO浓度(CO_mg/m3)。
  • 计算各小时全国的平均CO浓度、最大CO浓度、最小CO浓度、中位数CO浓度(CO_mg/m3)。
  • 计算不同采样点各小时的平均CO浓度、最大CO浓度、最小CO浓度、中位数CO浓度(CO_mg/m3)。
  • 计算各采样点中CO浓度小于全国平均CO浓度的占比。
  • 找出全国各采样点中CO浓度小于全国平均CO浓度的占比最高的10个采样点。
airqualitydf <- readxl::read_xlsx("../../data/airquality.xlsx",
  sheet = 2)

练习

按月统计dep_delay最大的3个航班的航班号(flight),用逗号连接。

year month flight
2013 1 HA51, MQ3695, MQ3944
2013 2 F9835, DL2319, DL575
2013 3 DL2119, DL2363, FL361
2013 4 DL2391, DL1435, AA257
2013 5 MQ3744, AA257, AA753
2013 6 MQ3535, DL2007, AA2019
2013 7 MQ3075, DL2047, VX187
2013 8 EV4978, DL843, DL1373
2013 9 AA177, DL2131, FL350
2013 10 DL502, DL2347, YV2693
2013 11 DL2042, AA1697, AA1139
2013 12 AA172, DL1223, AA172

数据变形示意图

billboard
# A tibble: 317 × 79
   artist     track date.entered   wk1   wk2   wk3   wk4   wk5   wk6   wk7   wk8
   <chr>      <chr> <date>       <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
 1 2 Pac      Baby… 2000-02-26      87    82    72    77    87    94    99    NA
 2 2Ge+her    The … 2000-09-02      91    87    92    NA    NA    NA    NA    NA
 3 3 Doors D… Kryp… 2000-04-08      81    70    68    67    66    57    54    53
 4 3 Doors D… Loser 2000-10-21      76    76    72    69    67    65    55    59
 5 504 Boyz   Wobb… 2000-04-15      57    34    25    17    17    31    36    49
 6 98^0       Give… 2000-08-19      51    39    34    26    26    19     2     2
 7 A*Teens    Danc… 2000-07-08      97    97    96    95   100    NA    NA    NA
 8 Aaliyah    I Do… 2000-01-29      84    62    51    41    38    35    35    38
 9 Aaliyah    Try … 2000-03-18      59    53    38    28    21    18    16    14
10 Adams, Yo… Open… 2000-08-26      76    76    74    69    68    67    61    58
# ℹ 307 more rows
# ℹ 68 more variables: wk9 <dbl>, wk10 <dbl>, wk11 <dbl>, wk12 <dbl>,
#   wk13 <dbl>, wk14 <dbl>, wk15 <dbl>, wk16 <dbl>, wk17 <dbl>, wk18 <dbl>,
#   wk19 <dbl>, wk20 <dbl>, wk21 <dbl>, wk22 <dbl>, wk23 <dbl>, wk24 <dbl>,
#   wk25 <dbl>, wk26 <dbl>, wk27 <dbl>, wk28 <dbl>, wk29 <dbl>, wk30 <dbl>,
#   wk31 <dbl>, wk32 <dbl>, wk33 <dbl>, wk34 <dbl>, wk35 <dbl>, wk36 <dbl>,
#   wk37 <dbl>, wk38 <dbl>, wk39 <dbl>, wk40 <dbl>, wk41 <dbl>, wk42 <dbl>, …
knitr::include_graphics("../../image/tidy-data/variables.png", dpi = 270)

数据变形

billboard |> 
  pivot_longer(
    cols = starts_with("wk"), 
    names_to = "week", 
    values_to = "rank",
    values_drop_na = TRUE
  )
# A tibble: 5,307 × 5
   artist  track                   date.entered week   rank
   <chr>   <chr>                   <date>       <chr> <dbl>
 1 2 Pac   Baby Don't Cry (Keep... 2000-02-26   wk1      87
 2 2 Pac   Baby Don't Cry (Keep... 2000-02-26   wk2      82
 3 2 Pac   Baby Don't Cry (Keep... 2000-02-26   wk3      72
 4 2 Pac   Baby Don't Cry (Keep... 2000-02-26   wk4      77
 5 2 Pac   Baby Don't Cry (Keep... 2000-02-26   wk5      87
 6 2 Pac   Baby Don't Cry (Keep... 2000-02-26   wk6      94
 7 2 Pac   Baby Don't Cry (Keep... 2000-02-26   wk7      99
 8 2Ge+her The Hardest Part Of ... 2000-09-02   wk1      91
 9 2Ge+her The Hardest Part Of ... 2000-09-02   wk2      87
10 2Ge+her The Hardest Part Of ... 2000-09-02   wk3      92
# ℹ 5,297 more rows

数据变形

billboard_longer <- billboard |> 
  pivot_longer(
    cols = starts_with("wk"), 
    names_to = "week", 
    values_to = "rank",
    values_drop_na = TRUE
  ) |> 
  mutate(
    week = parse_number(week)
  )
billboard_longer
# A tibble: 5,307 × 5
   artist  track                   date.entered  week  rank
   <chr>   <chr>                   <date>       <dbl> <dbl>
 1 2 Pac   Baby Don't Cry (Keep... 2000-02-26       1    87
 2 2 Pac   Baby Don't Cry (Keep... 2000-02-26       2    82
 3 2 Pac   Baby Don't Cry (Keep... 2000-02-26       3    72
 4 2 Pac   Baby Don't Cry (Keep... 2000-02-26       4    77
 5 2 Pac   Baby Don't Cry (Keep... 2000-02-26       5    87
 6 2 Pac   Baby Don't Cry (Keep... 2000-02-26       6    94
 7 2 Pac   Baby Don't Cry (Keep... 2000-02-26       7    99
 8 2Ge+her The Hardest Part Of ... 2000-09-02       1    91
 9 2Ge+her The Hardest Part Of ... 2000-09-02       2    87
10 2Ge+her The Hardest Part Of ... 2000-09-02       3    92
# ℹ 5,297 more rows

练习

# A tibble: 3 × 3
  id      bp1   bp2
  <chr> <dbl> <dbl>
1 A       100   120
2 B       140   115
3 C       120   125

将以上数据(df)转换为如下形式。

# A tibble: 6 × 3
  id    measurement value
  <chr> <chr>       <dbl>
1 A     bp1           100
2 A     bp2           120
3 B     bp1           140
4 B     bp2           115
5 C     bp1           120
6 C     bp2           125

练习

请转换如下iris数据。

# A tibble: 3 × 5
  Sepal.Length Sepal.Width Petal.Length Petal.Width Species
         <dbl>       <dbl>        <dbl>       <dbl> <fct>  
1          5.1         3.5          1.4         0.2 setosa 
2          4.9         3            1.4         0.2 setosa 
3          4.7         3.2          1.3         0.2 setosa 
转为如下形式:
# A tibble: 6 × 3
  Species flower_attr  attr_value
  <fct>   <chr>             <dbl>
1 setosa  Sepal.Length        5.1
2 setosa  Sepal.Width         3.5
3 setosa  Petal.Length        1.4
4 setosa  Petal.Width         0.2
5 setosa  Sepal.Length        4.9
6 setosa  Sepal.Width         3  

数据变形示意图2

who2
# A tibble: 7,240 × 58
   country      year sp_m_014 sp_m_1524 sp_m_2534 sp_m_3544 sp_m_4554 sp_m_5564
   <chr>       <dbl>    <dbl>     <dbl>     <dbl>     <dbl>     <dbl>     <dbl>
 1 Afghanistan  1980       NA        NA        NA        NA        NA        NA
 2 Afghanistan  1981       NA        NA        NA        NA        NA        NA
 3 Afghanistan  1982       NA        NA        NA        NA        NA        NA
 4 Afghanistan  1983       NA        NA        NA        NA        NA        NA
 5 Afghanistan  1984       NA        NA        NA        NA        NA        NA
 6 Afghanistan  1985       NA        NA        NA        NA        NA        NA
 7 Afghanistan  1986       NA        NA        NA        NA        NA        NA
 8 Afghanistan  1987       NA        NA        NA        NA        NA        NA
 9 Afghanistan  1988       NA        NA        NA        NA        NA        NA
10 Afghanistan  1989       NA        NA        NA        NA        NA        NA
# ℹ 7,230 more rows
# ℹ 50 more variables: sp_m_65 <dbl>, sp_f_014 <dbl>, sp_f_1524 <dbl>,
#   sp_f_2534 <dbl>, sp_f_3544 <dbl>, sp_f_4554 <dbl>, sp_f_5564 <dbl>,
#   sp_f_65 <dbl>, sn_m_014 <dbl>, sn_m_1524 <dbl>, sn_m_2534 <dbl>,
#   sn_m_3544 <dbl>, sn_m_4554 <dbl>, sn_m_5564 <dbl>, sn_m_65 <dbl>,
#   sn_f_014 <dbl>, sn_f_1524 <dbl>, sn_f_2534 <dbl>, sn_f_3544 <dbl>,
#   sn_f_4554 <dbl>, sn_f_5564 <dbl>, sn_f_65 <dbl>, ep_m_014 <dbl>, …
knitr::include_graphics("../../image/tidy-data/multiple-names.png", dpi = 270)

数据变形

who2 |> 
  pivot_longer(
    cols = !(country:year),
    names_to = c("diagnosis", "gender", "age"), 
    names_sep = "_",
    values_to = "count"
  )
# A tibble: 405,440 × 6
   country      year diagnosis gender age   count
   <chr>       <dbl> <chr>     <chr>  <chr> <dbl>
 1 Afghanistan  1980 sp        m      014      NA
 2 Afghanistan  1980 sp        m      1524     NA
 3 Afghanistan  1980 sp        m      2534     NA
 4 Afghanistan  1980 sp        m      3544     NA
 5 Afghanistan  1980 sp        m      4554     NA
 6 Afghanistan  1980 sp        m      5564     NA
 7 Afghanistan  1980 sp        m      65       NA
 8 Afghanistan  1980 sp        f      014      NA
 9 Afghanistan  1980 sp        f      1524     NA
10 Afghanistan  1980 sp        f      2534     NA
# ℹ 405,430 more rows

数据变形示意图

household
# A tibble: 5 × 5
  family dob_child1 dob_child2 name_child1 name_child2
   <int> <date>     <date>     <chr>       <chr>      
1      1 1998-11-26 2000-01-29 Susan       Jose       
2      2 1996-06-22 NA         Mark        <NA>       
3      3 2002-07-11 2004-04-05 Sam         Seth       
4      4 2004-10-10 2009-08-27 Craig       Khai       
5      5 2000-12-05 2005-02-28 Parker      Gracie     
knitr::include_graphics("../../image/tidy-data/names-and-values.png", dpi = 270)

数据变形

household |> 
  pivot_longer(
    cols = !family, 
    names_to = c(".value", "child"), 
    names_sep = "_", 
    values_drop_na = TRUE
  )
# A tibble: 9 × 4
  family child  dob        name  
   <int> <chr>  <date>     <chr> 
1      1 child1 1998-11-26 Susan 
2      1 child2 2000-01-29 Jose  
3      2 child1 1996-06-22 Mark  
4      3 child1 2002-07-11 Sam   
5      3 child2 2004-04-05 Seth  
6      4 child1 2004-10-10 Craig 
7      4 child2 2009-08-27 Khai  
8      5 child1 2000-12-05 Parker
9      5 child2 2005-02-28 Gracie

查看数据

cms_patient_experience
# A tibble: 500 × 5
   org_pac_id org_nm                           measure_cd measure_title prf_rate
   <chr>      <chr>                            <chr>      <chr>            <dbl>
 1 0446157747 USC CARE MEDICAL GROUP INC       CAHPS_GRP… CAHPS for MI…       63
 2 0446157747 USC CARE MEDICAL GROUP INC       CAHPS_GRP… CAHPS for MI…       87
 3 0446157747 USC CARE MEDICAL GROUP INC       CAHPS_GRP… CAHPS for MI…       86
 4 0446157747 USC CARE MEDICAL GROUP INC       CAHPS_GRP… CAHPS for MI…       57
 5 0446157747 USC CARE MEDICAL GROUP INC       CAHPS_GRP… CAHPS for MI…       85
 6 0446157747 USC CARE MEDICAL GROUP INC       CAHPS_GRP… CAHPS for MI…       24
 7 0446162697 ASSOCIATION OF UNIVERSITY PHYSI… CAHPS_GRP… CAHPS for MI…       59
 8 0446162697 ASSOCIATION OF UNIVERSITY PHYSI… CAHPS_GRP… CAHPS for MI…       85
 9 0446162697 ASSOCIATION OF UNIVERSITY PHYSI… CAHPS_GRP… CAHPS for MI…       83
10 0446162697 ASSOCIATION OF UNIVERSITY PHYSI… CAHPS_GRP… CAHPS for MI…       63
# ℹ 490 more rows

查看数据

cms_patient_experience |> 
  distinct(measure_cd, measure_title)
# A tibble: 6 × 2
  measure_cd   measure_title                                                    
  <chr>        <chr>                                                            
1 CAHPS_GRP_1  CAHPS for MIPS SSM: Getting Timely Care, Appointments, and Infor…
2 CAHPS_GRP_2  CAHPS for MIPS SSM: How Well Providers Communicate               
3 CAHPS_GRP_3  CAHPS for MIPS SSM: Patient's Rating of Provider                 
4 CAHPS_GRP_5  CAHPS for MIPS SSM: Health Promotion and Education               
5 CAHPS_GRP_8  CAHPS for MIPS SSM: Courteous and Helpful Office Staff           
6 CAHPS_GRP_12 CAHPS for MIPS SSM: Stewardship of Patient Resources             

数据变形(变宽)

cms_patient_experience |> 
  pivot_wider(
    names_from = measure_cd,
    values_from = prf_rate
  )
# A tibble: 500 × 9
   org_pac_id org_nm           measure_title CAHPS_GRP_1 CAHPS_GRP_2 CAHPS_GRP_3
   <chr>      <chr>            <chr>               <dbl>       <dbl>       <dbl>
 1 0446157747 USC CARE MEDICA… CAHPS for MI…          63          NA          NA
 2 0446157747 USC CARE MEDICA… CAHPS for MI…          NA          87          NA
 3 0446157747 USC CARE MEDICA… CAHPS for MI…          NA          NA          86
 4 0446157747 USC CARE MEDICA… CAHPS for MI…          NA          NA          NA
 5 0446157747 USC CARE MEDICA… CAHPS for MI…          NA          NA          NA
 6 0446157747 USC CARE MEDICA… CAHPS for MI…          NA          NA          NA
 7 0446162697 ASSOCIATION OF … CAHPS for MI…          59          NA          NA
 8 0446162697 ASSOCIATION OF … CAHPS for MI…          NA          85          NA
 9 0446162697 ASSOCIATION OF … CAHPS for MI…          NA          NA          83
10 0446162697 ASSOCIATION OF … CAHPS for MI…          NA          NA          NA
# ℹ 490 more rows
# ℹ 3 more variables: CAHPS_GRP_5 <dbl>, CAHPS_GRP_8 <dbl>, CAHPS_GRP_12 <dbl>

数据变形(变宽)

cms_patient_experience |> 
  pivot_wider(
    id_cols = starts_with("org"),
    names_from = measure_cd,
    values_from = prf_rate
  )
# A tibble: 95 × 8
   org_pac_id org_nm CAHPS_GRP_1 CAHPS_GRP_2 CAHPS_GRP_3 CAHPS_GRP_5 CAHPS_GRP_8
   <chr>      <chr>        <dbl>       <dbl>       <dbl>       <dbl>       <dbl>
 1 0446157747 USC C…          63          87          86          57          85
 2 0446162697 ASSOC…          59          85          83          63          88
 3 0547164295 BEAVE…          49          NA          75          44          73
 4 0749333730 CAPE …          67          84          85          65          82
 5 0840104360 ALLIA…          66          87          87          64          87
 6 0840109864 REX H…          73          87          84          67          91
 7 0840513552 SCL H…          58          83          76          58          78
 8 0941545784 GRITM…          46          86          81          54          NA
 9 1052612785 COMMU…          65          84          80          58          87
10 1254237779 OUR L…          61          NA          NA          65          NA
# ℹ 85 more rows
# ℹ 1 more variable: CAHPS_GRP_12 <dbl>

练习

df <- tribble(
  ~id, ~measurement, ~value,
  "A",        "bp1",    100,
  "B",        "bp1",    140,
  "B",        "bp2",    115, 
  "A",        "bp2",    120,
  "A",        "bp3",    105
)

变形成如下形式:

# A tibble: 2 × 4
  id      bp1   bp2   bp3
  <chr> <dbl> <dbl> <dbl>
1 A       100   120   105
2 B       140   115    NA

练习:变宽

df <- tribble(
  ~id, ~measurement, ~value,
  "A",        "bp1",    100,
  "A",        "bp1",    102,
  "A",        "bp2",    120,
  "B",        "bp1",    140, 
  "B",        "bp2",    115
)

练习

df |>
  pivot_wider(
    names_from = measurement,
    values_from = value
  )
# A tibble: 2 × 3
  id    bp1       bp2      
  <chr> <list>    <list>   
1 A     <dbl [2]> <dbl [1]>
2 B     <dbl [1]> <dbl [1]>

练习

df |> 
  group_by(id, measurement) |> 
  summarize(n = n(), .groups = "drop") |> 
  filter(n > 1)
# A tibble: 1 × 3
  id    measurement     n
  <chr> <chr>       <int>
1 A     bp1             2

nest,套嵌数据框

# A tibble: 6 × 3
      x     y     z
  <dbl> <int> <int>
1     1     1     6
2     1     2     5
3     1     3     4
4     2     4     3
5     2     5     2
6     3     6     1
df %>% nest(data = c(y, z))
# A tibble: 3 × 2
      x data            
  <dbl> <list>          
1     1 <tibble [3 × 2]>
2     2 <tibble [2 × 2]>
3     3 <tibble [1 × 2]>

nest,套嵌数据框

Specify variables to nest by (rather than variables to nest) using .by

df %>% nest(.by = x)
# A tibble: 3 × 2
      x data            
  <dbl> <list>          
1     1 <tibble [3 × 2]>
2     2 <tibble [2 × 2]>
3     3 <tibble [1 × 2]>

nest,套嵌数据框

In this case, since ... isn’t used you can specify the resulting column name with .key

df %>% nest(.by = x, .key = "cols")
# A tibble: 3 × 2
      x cols            
  <dbl> <list>          
1     1 <tibble [3 × 2]>
2     2 <tibble [2 × 2]>
3     3 <tibble [1 × 2]>

nest,套嵌数据框

Use tidyselect syntax and helpers, just like in dplyr::select()

df %>% nest(data = any_of(c("y", "z")))
# A tibble: 3 × 2
      x data            
  <dbl> <list>          
1     1 <tibble [3 × 2]>
2     2 <tibble [2 × 2]>
3     3 <tibble [1 × 2]>

nest,套嵌数据框

... and .by can be used together to drop columns you no longer need, or to include the columns you are nesting by in the inner data frame too. This drops z:

df %>% nest(data = y, .by = x)
# A tibble: 3 × 2
      x data            
  <dbl> <list>          
1     1 <tibble [3 × 1]>
2     2 <tibble [2 × 1]>
3     3 <tibble [1 × 1]>

nest,套嵌数据框

This includes x in the inner data frame:

df %>% nest(data = everything(), .by = x)
# A tibble: 3 × 2
      x data            
  <dbl> <list>          
1     1 <tibble [3 × 3]>
2     2 <tibble [2 × 3]>
3     3 <tibble [1 × 3]>

nest,套嵌数据框

Multiple nesting structures can be specified at once

iris %>%
  nest(petal = starts_with("Petal"), sepal = starts_with("Sepal"))
# A tibble: 3 × 3
  Species    petal             sepal            
  <fct>      <list>            <list>           
1 setosa     <tibble [50 × 2]> <tibble [50 × 2]>
2 versicolor <tibble [50 × 2]> <tibble [50 × 2]>
3 virginica  <tibble [50 × 2]> <tibble [50 × 2]>

nest,套嵌数据框

iris %>%
  nest(width = contains("Width"), length = contains("Length"))
# A tibble: 3 × 3
  Species    width             length           
  <fct>      <list>            <list>           
1 setosa     <tibble [50 × 2]> <tibble [50 × 2]>
2 versicolor <tibble [50 × 2]> <tibble [50 × 2]>
3 virginica  <tibble [50 × 2]> <tibble [50 × 2]>

nest,套嵌数据框

Nesting a grouped data frame nests all variables apart from the group vars

fish_encounters
# A tibble: 114 × 3
   fish  station  seen
   <fct> <fct>   <int>
 1 4842  Release     1
 2 4842  I80_1       1
 3 4842  Lisbon      1
 4 4842  Rstr        1
 5 4842  Base_TD     1
 6 4842  BCE         1
 7 4842  BCW         1
 8 4842  BCE2        1
 9 4842  BCW2        1
10 4842  MAE         1
# ℹ 104 more rows
fish_encounters %>%
  dplyr::group_by(fish) %>%
  nest()
# A tibble: 19 × 2
# Groups:   fish [19]
   fish  data             
   <fct> <list>           
 1 4842  <tibble [11 × 2]>
 2 4843  <tibble [11 × 2]>
 3 4844  <tibble [11 × 2]>
 4 4845  <tibble [5 × 2]> 
 5 4847  <tibble [3 × 2]> 
 6 4848  <tibble [4 × 2]> 
 7 4849  <tibble [2 × 2]> 
 8 4850  <tibble [6 × 2]> 
 9 4851  <tibble [2 × 2]> 
10 4854  <tibble [2 × 2]> 
11 4855  <tibble [5 × 2]> 
12 4857  <tibble [9 × 2]> 
13 4858  <tibble [11 × 2]>
14 4859  <tibble [5 × 2]> 
15 4861  <tibble [11 × 2]>
16 4862  <tibble [9 × 2]> 
17 4863  <tibble [2 × 2]> 
18 4864  <tibble [2 × 2]> 
19 4865  <tibble [3 × 2]> 

nest,套嵌数据框

That is similar to nest(.by = ), except here the result isn’t grouped

fish_encounters %>%
  nest(.by = fish)
# A tibble: 19 × 2
   fish  data             
   <fct> <list>           
 1 4842  <tibble [11 × 2]>
 2 4843  <tibble [11 × 2]>
 3 4844  <tibble [11 × 2]>
 4 4845  <tibble [5 × 2]> 
 5 4847  <tibble [3 × 2]> 
 6 4848  <tibble [4 × 2]> 
 7 4849  <tibble [2 × 2]> 
 8 4850  <tibble [6 × 2]> 
 9 4851  <tibble [2 × 2]> 
10 4854  <tibble [2 × 2]> 
11 4855  <tibble [5 × 2]> 
12 4857  <tibble [9 × 2]> 
13 4858  <tibble [11 × 2]>
14 4859  <tibble [5 × 2]> 
15 4861  <tibble [11 × 2]>
16 4862  <tibble [9 × 2]> 
17 4863  <tibble [2 × 2]> 
18 4864  <tibble [2 × 2]> 
19 4865  <tibble [3 × 2]> 

nest,套嵌数据框

Nesting is often useful for creating per group models

mtcars %>%
  nest(.by = cyl) %>%
  dplyr::mutate(models = lapply(data, function(df) lm(mpg ~ wt, data = df)))
# A tibble: 3 × 3
    cyl data               models
  <dbl> <list>             <list>
1     6 <tibble [7 × 10]>  <lm>  
2     4 <tibble [11 × 10]> <lm>  
3     8 <tibble [14 × 10]> <lm>  

练习

# A tibble: 20,088 × 20
   datetime            site  `CO_mg/m3` `CO_24h_mg/m3` `NO2_μg/m3`
   <dttm>              <chr>      <dbl>          <dbl>       <dbl>
 1 2024-03-19 01:00:00 1001A        0.1            0.4           5
 2 2024-03-19 01:00:00 1003A        0.2            0.4           9
 3 2024-03-19 01:00:00 1004A        0.2            0.4           4
 4 2024-03-19 01:00:00 1005A        0.1            0.3           6
 5 2024-03-19 01:00:00 1006A        0.1            0.4           5
 6 2024-03-19 01:00:00 1007A        0.3            0.5           6
 7 2024-03-19 01:00:00 1008A        0.2            0.4           2
 8 2024-03-19 01:00:00 1009A        0.2            0.4           2
 9 2024-03-19 01:00:00 1010A        0.1            0.3           2
10 2024-03-19 01:00:00 1011A        0.2            0.4          12
# ℹ 20,078 more rows
# ℹ 15 more variables: `NO2_24h_μg/m3` <dbl>, `O3_μg/m3` <dbl>,
#   `O3_24h_μg/m3` <lgl>, `O3_8h_μg/m3` <lgl>, `O3_8h_24h_μg/m3` <lgl>,
#   `PM10_μg/m3` <dbl>, `PM10_24h_μg/m3` <dbl>, `PM2.5_μg/m3` <dbl>,
#   `PM2.5_24h_μg/m3` <dbl>, `SO2_μg/m3` <dbl>, `SO2_24h_μg/m3` <dbl>,
#   AQI <dbl>, PrimaryPollutant <chr>, Quality <chr>, Unheathful <chr>
airqualitydf
# A tibble: 20,088 × 20
   datetime            site  `CO_mg/m3` `CO_24h_mg/m3` `NO2_μg/m3`
   <dttm>              <chr>      <dbl>          <dbl>       <dbl>
 1 2024-03-19 01:00:00 1001A        0.1            0.4           5
 2 2024-03-19 01:00:00 1003A        0.2            0.4           9
 3 2024-03-19 01:00:00 1004A        0.2            0.4           4
 4 2024-03-19 01:00:00 1005A        0.1            0.3           6
 5 2024-03-19 01:00:00 1006A        0.1            0.4           5
 6 2024-03-19 01:00:00 1007A        0.3            0.5           6
 7 2024-03-19 01:00:00 1008A        0.2            0.4           2
 8 2024-03-19 01:00:00 1009A        0.2            0.4           2
 9 2024-03-19 01:00:00 1010A        0.1            0.3           2
10 2024-03-19 01:00:00 1011A        0.2            0.4          12
# ℹ 20,078 more rows
# ℹ 15 more variables: `NO2_24h_μg/m3` <dbl>, `O3_μg/m3` <dbl>,
#   `O3_24h_μg/m3` <lgl>, `O3_8h_μg/m3` <lgl>, `O3_8h_24h_μg/m3` <lgl>,
#   `PM10_μg/m3` <dbl>, `PM10_24h_μg/m3` <dbl>, `PM2.5_μg/m3` <dbl>,
#   `PM2.5_24h_μg/m3` <dbl>, `SO2_μg/m3` <dbl>, `SO2_24h_μg/m3` <dbl>,
#   AQI <dbl>, PrimaryPollutant <chr>, Quality <chr>, Unheathful <chr>
airqualitynestdf <- airqualitydf |>
  nest(sitedf = -site)

nestgroup_by联用

# A tibble: 3 × 2
# Groups:   Species [3]
  Species    spdf             
  <fct>      <list>           
1 setosa     <tibble [50 × 4]>
2 versicolor <tibble [50 × 4]>
3 virginica  <tibble [50 × 4]>

unnest

airqualitynestdf |> unnest(sitedf)
# A tibble: 20,088 × 20
   site  datetime            `CO_mg/m3` `CO_24h_mg/m3` `NO2_μg/m3`
   <chr> <dttm>                   <dbl>          <dbl>       <dbl>
 1 1001A 2024-03-19 01:00:00        0.1            0.4           5
 2 1001A 2024-03-19 03:00:00        0.1            0.3           6
 3 1001A 2024-03-19 05:00:00        0.2            0.3           8
 4 1001A 2024-03-19 07:00:00        0.2            0.3          12
 5 1001A 2024-03-19 09:00:00        0.3            0.3          19
 6 1001A 2024-03-19 11:00:00        0.2            0.2           9
 7 1001A 2024-03-19 13:00:00        0.2            0.2           7
 8 1001A 2024-03-19 15:00:00        0.1            0.2           5
 9 1001A 2024-03-19 17:00:00        0.2            0.2           3
10 1001A 2024-03-19 19:00:00        0.2            0.2           5
# ℹ 20,078 more rows
# ℹ 15 more variables: `NO2_24h_μg/m3` <dbl>, `O3_μg/m3` <dbl>,
#   `O3_24h_μg/m3` <lgl>, `O3_8h_μg/m3` <lgl>, `O3_8h_24h_μg/m3` <lgl>,
#   `PM10_μg/m3` <dbl>, `PM10_24h_μg/m3` <dbl>, `PM2.5_μg/m3` <dbl>,
#   `PM2.5_24h_μg/m3` <dbl>, `SO2_μg/m3` <dbl>, `SO2_24h_μg/m3` <dbl>,
#   AQI <dbl>, PrimaryPollutant <chr>, Quality <chr>, Unheathful <chr>

purrr

  • map():依次应用一元函数到一个序列的每个元素上,基本等同 lapply()
  • map2():依次应用二元函数到两个序列的每对元素上
  • pmap():应用多元函数到多个序列的每组元素上,可以实现对数据框逐行迭代
  • map 系列默认返回列表型,可根据想要的返回类型添加后缀:_int, _dbl, _lgl, _chr, _df, 甚至可以接着对返回的数据框df做行/列合并:_dfr, _dfc
  • 如果只想要函数依次作用的过程,而不需要返回结果,改用 walk 系列即可
  • 所应用的函数,有 purrr公式风格简写(匿名函数),支持一元,二元,多元函数
  • purrr 包中的其它有用函数

purrr

  • map_chr(.x, .f): 返回字符型向量
  • map_lgl(.x, .f): 返回逻辑型向量
  • map_dbl(.x, .f): 返回实数型向量
  • map_int(.x, .f): 返回整数型向量
  • map_dfr(.x, .f): 返回数据框列表,再 bind_rows 按行合并为一个数据框
  • map_dfc(.x, .f): 返回数据框列表,再 bind_cols 按列合并为一个数据框

purrr包-cheatsheet

dwfun::ggsavep("../../image/cheatsheet/purrr.svg", loadit = TRUE)

purrr

purrr

生成从1到10的10组随机数,每组随机数个数为100,均值依次为1到10,标准差为1,并存储在数据框中。

res <- list()
for (i in 1:10) {
  res[[i]] <- tibble(随机数 = rnorm(n = 100, mean = i, sd = 1))
}
res
[[1]]
# A tibble: 100 × 1
   随机数
    <dbl>
 1 -0.400
 2  1.26 
 3 -1.44 
 4  0.994
 5  1.62 
 6  2.15 
 7 -0.822
 8  0.753
 9  0.756
10  0.717
# ℹ 90 more rows

[[2]]
# A tibble: 100 × 1
   随机数
    <dbl>
 1  1.61 
 2  1.21 
 3  0.943
 4  1.20 
 5  0.244
 6  1.31 
 7  1.44 
 8  1.46 
 9  2.23 
10  2.98 
# ℹ 90 more rows

[[3]]
# A tibble: 100 × 1
   随机数
    <dbl>
 1  2.57 
 2  4.36 
 3  2.93 
 4  2.73 
 5  0.553
 6  3.07 
 7  1.90 
 8  2.37 
 9  0.936
10  5.65 
# ℹ 90 more rows

[[4]]
# A tibble: 100 × 1
   随机数
    <dbl>
 1   3.64
 2   2.94
 3   5.08
 4   5.18
 5   4.20
 6   3.60
 7   4.62
 8   5.97
 9   5.88
10   2.41
# ℹ 90 more rows

[[5]]
# A tibble: 100 × 1
   随机数
    <dbl>
 1   6.74
 2   4.15
 3   4.04
 4   6.02
 5   3.50
 6   3.82
 7   5.63
 8   7.10
 9   4.39
10   3.37
# ℹ 90 more rows

[[6]]
# A tibble: 100 × 1
   随机数
    <dbl>
 1   5.29
 2   6.66
 3   6.29
 4   6.20
 5   4.80
 6   5.96
 7   6.69
 8   6.71
 9   6.99
10   7.14
# ℹ 90 more rows

[[7]]
# A tibble: 100 × 1
   随机数
    <dbl>
 1   7.24
 2   7.63
 3   7.42
 4   8.98
 5   6.49
 6   5.89
 7   6.05
 8   7.48
 9   6.20
10   7.23
# ℹ 90 more rows

[[8]]
# A tibble: 100 × 1
   随机数
    <dbl>
 1   8.89
 2   7.62
 3   8.61
 4   8.00
 5   7.48
 6   7.36
 7   7.36
 8   8.11
 9   9.18
10   8.45
# ℹ 90 more rows

[[9]]
# A tibble: 100 × 1
   随机数
    <dbl>
 1  10.5 
 2   8.58
 3   8.48
 4   9.85
 5   9.33
 6   8.17
 7   8.78
 8   7.45
 9   9.23
10   9.03
# ℹ 90 more rows

[[10]]
# A tibble: 100 × 1
   随机数
    <dbl>
 1  11.4 
 2  11.7 
 3  11.2 
 4   9.85
 5   9.82
 6  10.8 
 7   9.58
 8  11.2 
 9  11.2 
10  10.7 
# ℹ 90 more rows

purrr

生成从1到10的10组随机数,每组随机数个数为100,均值依次为1到10,标准差为1,并存储在数据框中。

1:10 |>
purrr::map(~tibble(随机数 = rnorm(n = 100, mean = .x, sd = 1)))
[[1]]
# A tibble: 100 × 1
   随机数
    <dbl>
 1  0.663
 2  0.784
 3  1.62 
 4 -0.284
 5 -0.300
 6  0.623
 7  1.10 
 8  0.296
 9  2.50 
10  0.697
# ℹ 90 more rows

[[2]]
# A tibble: 100 × 1
   随机数
    <dbl>
 1 0.294 
 2 1.14  
 3 1.86  
 4 1.68  
 5 1.83  
 6 0.764 
 7 0.0977
 8 1.91  
 9 2.03  
10 2.46  
# ℹ 90 more rows

[[3]]
# A tibble: 100 × 1
   随机数
    <dbl>
 1   2.09
 2   6.11
 3   1.93
 4   2.30
 5   2.80
 6   3.74
 7   2.95
 8   4.31
 9   3.09
10   2.37
# ℹ 90 more rows

[[4]]
# A tibble: 100 × 1
   随机数
    <dbl>
 1   2.85
 2   3.73
 3   4.46
 4   3.98
 5   3.46
 6   4.88
 7   4.74
 8   3.84
 9   3.25
10   2.75
# ℹ 90 more rows

[[5]]
# A tibble: 100 × 1
   随机数
    <dbl>
 1   3.86
 2   4.72
 3   4.11
 4   5.14
 5   4.25
 6   5.52
 7   4.81
 8   5.03
 9   5.36
10   4.97
# ℹ 90 more rows

[[6]]
# A tibble: 100 × 1
   随机数
    <dbl>
 1   5.42
 2   5.83
 3   4.08
 4   4.47
 5   4.89
 6   7.60
 7   5.36
 8   7.57
 9   4.55
10   5.21
# ℹ 90 more rows

[[7]]
# A tibble: 100 × 1
   随机数
    <dbl>
 1   6.29
 2   8.48
 3   7.85
 4   8.29
 5   7.30
 6   6.59
 7   6.86
 8   6.78
 9   8.75
10   6.92
# ℹ 90 more rows

[[8]]
# A tibble: 100 × 1
   随机数
    <dbl>
 1   8.48
 2   9.67
 3   7.96
 4   7.56
 5   8.73
 6   7.42
 7   8.07
 8   8.06
 9   8.85
10  10.6 
# ℹ 90 more rows

[[9]]
# A tibble: 100 × 1
   随机数
    <dbl>
 1  10.0 
 2   9.53
 3  10.2 
 4   9.17
 5   9.58
 6   8.88
 7   9.90
 8   8.98
 9   8.08
10   7.94
# ℹ 90 more rows

[[10]]
# A tibble: 100 × 1
   随机数
    <dbl>
 1   9.29
 2   8.64
 3   9.50
 4  11.4 
 5   9.49
 6  10.0 
 7  11.8 
 8  11.0 
 9   9.42
10   8.41
# ℹ 90 more rows

purrr

library(purrr)
mtcars |> 
  split(mtcars$cyl) |>  # from base R
  map(\(df) lm(mpg ~ wt, data = df)) |> 
  map(summary) %>%
  map_dbl("r.squared")
        4         6         8 
0.5086326 0.4645102 0.4229655 

练习:

计算每月最后一个周六的航班数:

flights
# A tibble: 336,776 × 19
    year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
   <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
 1  2013     1     1      517            515         2      830            819
 2  2013     1     1      533            529         4      850            830
 3  2013     1     1      542            540         2      923            850
 4  2013     1     1      544            545        -1     1004           1022
 5  2013     1     1      554            600        -6      812            837
 6  2013     1     1      554            558        -4      740            728
 7  2013     1     1      555            600        -5      913            854
 8  2013     1     1      557            600        -3      709            723
 9  2013     1     1      557            600        -3      838            846
10  2013     1     1      558            600        -2      753            745
# ℹ 336,766 more rows
# ℹ 11 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
#   tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
#   hour <dbl>, minute <dbl>, time_hour <dttm>

tidyr + purrr

任务:展示不同城市间的大气指标散点图

(airqualitydf <- readxl::read_xlsx("../../data/airquality.xlsx",
  sheet = 2))
# A tibble: 20,088 × 20
   datetime            site  `CO_mg/m3` `CO_24h_mg/m3` `NO2_μg/m3`
   <dttm>              <chr>      <dbl>          <dbl>       <dbl>
 1 2024-03-19 01:00:00 1001A        0.1            0.4           5
 2 2024-03-19 01:00:00 1003A        0.2            0.4           9
 3 2024-03-19 01:00:00 1004A        0.2            0.4           4
 4 2024-03-19 01:00:00 1005A        0.1            0.3           6
 5 2024-03-19 01:00:00 1006A        0.1            0.4           5
 6 2024-03-19 01:00:00 1007A        0.3            0.5           6
 7 2024-03-19 01:00:00 1008A        0.2            0.4           2
 8 2024-03-19 01:00:00 1009A        0.2            0.4           2
 9 2024-03-19 01:00:00 1010A        0.1            0.3           2
10 2024-03-19 01:00:00 1011A        0.2            0.4          12
# ℹ 20,078 more rows
# ℹ 15 more variables: `NO2_24h_μg/m3` <dbl>, `O3_μg/m3` <dbl>,
#   `O3_24h_μg/m3` <lgl>, `O3_8h_μg/m3` <lgl>, `O3_8h_24h_μg/m3` <lgl>,
#   `PM10_μg/m3` <dbl>, `PM10_24h_μg/m3` <dbl>, `PM2.5_μg/m3` <dbl>,
#   `PM2.5_24h_μg/m3` <dbl>, `SO2_μg/m3` <dbl>, `SO2_24h_μg/m3` <dbl>,
#   AQI <dbl>, PrimaryPollutant <chr>, Quality <chr>, Unheathful <chr>

join

Perform left join

(df1 <- data.frame(id = 1:5, value1 = letters[1:5]))
  id value1
1  1      a
2  2      b
3  3      c
4  4      d
5  5      e
(df2 <- data.frame(id = c(2, 4, 6), value2 = LETTERS[1:3]))
  id value2
1  2      A
2  4      B
3  6      C
left_join(df1, df2, by = "id")
  id value1 value2
1  1      a   <NA>
2  2      b      A
3  3      c   <NA>
4  4      d      B
5  5      e   <NA>

left join

Create sample data frames with non-matching rows

(df1 <- data.frame(id = 1:5, value1 = letters[1:5]))
  id value1
1  1      a
2  2      b
3  3      c
4  4      d
5  5      e
(df2 <- data.frame(id = c(2, 4, 6), value2 = LETTERS[1:3]))
  id value2
1  2      A
2  4      B
3  6      C
left_join(df2, df1, by = "id")
  id value2 value1
1  2      A      b
2  4      B      d
3  6      C   <NA>

left join

Create sample data frames with multiple columns

df1 <- data.frame(id1 = c(1, 2, 3), id2 = c("A", "B", "C"), value1 = letters[1:3])
df2 <- data.frame(id1 = c(2, 3, 4), id2 = c("B", "C", "D"), value2 = LETTERS[1:3])
# Perform left join
left_join(df1, df2, by = c("id1", "id2"))
  id1 id2 value1 value2
1   1   A      a   <NA>
2   2   B      b      A
3   3   C      c      B

right join

(df1 <- data.frame(id = 1:5, value1 = letters[1:5]))
  id value1
1  1      a
2  2      b
3  3      c
4  4      d
5  5      e
(df2 <- data.frame(id = c(2, 4, 6), value2 = LETTERS[1:3]))
  id value2
1  2      A
2  4      B
3  6      C
# Perform right join
right_join(df1, df2, by = "id")
  id value1 value2
1  2      b      A
2  4      d      B
3  6   <NA>      C

inner join

# Create sample data frames
(df1 <- data.frame(id = 1:5, value1 = letters[1:5]))
  id value1
1  1      a
2  2      b
3  3      c
4  4      d
5  5      e
(df2 <- data.frame(id = c(2, 4, 6), value2 = LETTERS[1:3]))
  id value2
1  2      A
2  4      B
3  6      C
# Perform inner join
inner_join(df1, df2, by = "id")
  id value1 value2
1  2      b      A
2  4      d      B

full join

# Create sample data frames
(df1 <- data.frame(id = 1:5, value1 = letters[1:5]))
  id value1
1  1      a
2  2      b
3  3      c
4  4      d
5  5      e
(df2 <- data.frame(id = c(2, 4, 6), value2 = LETTERS[1:3]))
  id value2
1  2      A
2  4      B
3  6      C
# Perform inner join
full_join(df1, df2, by = "id")
  id value1 value2
1  1      a   <NA>
2  2      b      A
3  3      c   <NA>
4  4      d      B
5  5      e   <NA>
6  6   <NA>      C

semi join

Create sample data frames

(df1 <- data.frame(id = 1:5, value1 = letters[1:5]))
  id value1
1  1      a
2  2      b
3  3      c
4  4      d
5  5      e
(df2 <- data.frame(id = c(2, 4, 6), value2 = LETTERS[1:3]))
  id value2
1  2      A
2  4      B
3  6      C
# Perform semi join
semi_join(df1, df2, by = "id")
  id value1
1  2      b
2  4      d

练习

合并airquality.xlsx中的数据。

练习

统计各城市白天与晚上的大气质量差异,计算不同指标差异最大的10个城市。

欢迎讨论!

苏命|https://drwater.rcees.ac.cn; https://drwater.rcees.ac.cn/bcard; Slides