Data import

《区域水环境污染数据分析实践》
Data analysis practice of regional water environment pollution

苏命、王为东
中国科学院大学资源与环境学院
中国科学院生态环境研究中心

2024-04-09

tidyverse风格数据分析总体流程

导入csv数据

read_lines("../../data/students.csv") |> cat(sep = "\n")
Student ID,Full Name,favourite.food,mealPlan,AGE
1,Sunil Huffmann,Strawberry yoghurt,Lunch only,4
2,Barclay Lynn,French fries,Lunch only,5
3,Jayendra Lyne,N/A,Breakfast and lunch,7
4,Leon Rossini,Anchovies,Lunch only,
5,Chidiegwu Dunkel,Pizza,Breakfast and lunch,five
6,Güvenç Attila,Ice cream,Lunch only,6

导入csv数据

read_csv("../../data/students.csv") |>
  knitr::kable()
Student ID Full Name favourite.food mealPlan AGE
1 Sunil Huffmann Strawberry yoghurt Lunch only 4
2 Barclay Lynn French fries Lunch only 5
3 Jayendra Lyne N/A Breakfast and lunch 7
4 Leon Rossini Anchovies Lunch only NA
5 Chidiegwu Dunkel Pizza Breakfast and lunch five
6 Güvenç Attila Ice cream Lunch only 6

读取数据

(students <- read_csv("../../data/students.csv"))
# A tibble: 6 × 5
  `Student ID` `Full Name`      favourite.food     mealPlan            AGE  
         <dbl> <chr>            <chr>              <chr>               <chr>
1            1 Sunil Huffmann   Strawberry yoghurt Lunch only          4    
2            2 Barclay Lynn     French fries       Lunch only          5    
3            3 Jayendra Lyne    N/A                Breakfast and lunch 7    
4            4 Leon Rossini     Anchovies          Lunch only          <NA> 
5            5 Chidiegwu Dunkel Pizza              Breakfast and lunch five 
6            6 Güvenç Attila    Ice cream          Lunch only          6    

读取数据

(students <- read_csv("../../data/students.csv", na = c("N/A", "")))
# A tibble: 6 × 5
  `Student ID` `Full Name`      favourite.food     mealPlan            AGE  
         <dbl> <chr>            <chr>              <chr>               <chr>
1            1 Sunil Huffmann   Strawberry yoghurt Lunch only          4    
2            2 Barclay Lynn     French fries       Lunch only          5    
3            3 Jayendra Lyne    <NA>               Breakfast and lunch 7    
4            4 Leon Rossini     Anchovies          Lunch only          <NA> 
5            5 Chidiegwu Dunkel Pizza              Breakfast and lunch five 
6            6 Güvenç Attila    Ice cream          Lunch only          6    

列名不要有空格

students |> 
  rename(
    student_id = `Student ID`,
    full_name = `Full Name`
  )
# A tibble: 6 × 5
  student_id full_name        favourite.food     mealPlan            AGE  
       <dbl> <chr>            <chr>              <chr>               <chr>
1          1 Sunil Huffmann   Strawberry yoghurt Lunch only          4    
2          2 Barclay Lynn     French fries       Lunch only          5    
3          3 Jayendra Lyne    <NA>               Breakfast and lunch 7    
4          4 Leon Rossini     Anchovies          Lunch only          <NA> 
5          5 Chidiegwu Dunkel Pizza              Breakfast and lunch five 
6          6 Güvenç Attila    Ice cream          Lunch only          6    

janitor处理空格

students |> janitor::clean_names()
# A tibble: 6 × 5
  student_id full_name        favourite_food     meal_plan           age  
       <dbl> <chr>            <chr>              <chr>               <chr>
1          1 Sunil Huffmann   Strawberry yoghurt Lunch only          4    
2          2 Barclay Lynn     French fries       Lunch only          5    
3          3 Jayendra Lyne    <NA>               Breakfast and lunch 7    
4          4 Leon Rossini     Anchovies          Lunch only          <NA> 
5          5 Chidiegwu Dunkel Pizza              Breakfast and lunch five 
6          6 Güvenç Attila    Ice cream          Lunch only          6    

janitor处理空格

students |>
  janitor::clean_names() |>
  mutate(meal_plan = factor(meal_plan))
# A tibble: 6 × 5
  student_id full_name        favourite_food     meal_plan           age  
       <dbl> <chr>            <chr>              <fct>               <chr>
1          1 Sunil Huffmann   Strawberry yoghurt Lunch only          4    
2          2 Barclay Lynn     French fries       Lunch only          5    
3          3 Jayendra Lyne    <NA>               Breakfast and lunch 7    
4          4 Leon Rossini     Anchovies          Lunch only          <NA> 
5          5 Chidiegwu Dunkel Pizza              Breakfast and lunch five 
6          6 Güvenç Attila    Ice cream          Lunch only          6    

janitor处理空格

students <- students |>
  janitor::clean_names() |>
  mutate(
    meal_plan = factor(meal_plan),
    age = parse_number(if_else(age == "five", "5", age))
  )
students
# A tibble: 6 × 5
  student_id full_name        favourite_food     meal_plan             age
       <dbl> <chr>            <chr>              <fct>               <dbl>
1          1 Sunil Huffmann   Strawberry yoghurt Lunch only              4
2          2 Barclay Lynn     French fries       Lunch only              5
3          3 Jayendra Lyne    <NA>               Breakfast and lunch     7
4          4 Leon Rossini     Anchovies          Lunch only             NA
5          5 Chidiegwu Dunkel Pizza              Breakfast and lunch     5
6          6 Güvenç Attila    Ice cream          Lunch only              6

直接录入

read_csv(
  "The first line of metadata
  The second line of metadata
  x,y,z
  1,2,3",
  skip = 2
)
# A tibble: 1 × 3
      x     y     z
  <dbl> <dbl> <dbl>
1     1     2     3

直接录入

read_csv(
  "# A comment I want to skip
  x,y,z
  1,2,3",
  comment = "#"
)
# A tibble: 1 × 3
      x     y     z
  <dbl> <dbl> <dbl>
1     1     2     3

指定列名

read_csv(
  "1,2,3
  4,5,6",
  col_names = c("x", "y", "z")
)
# A tibble: 2 × 3
      x     y     z
  <dbl> <dbl> <dbl>
1     1     2     3
2     4     5     6

指定列的类型

another_csv <- "
x,y,z
1,2,3"

read_csv(
  another_csv, 
  col_types = cols(.default = col_character())
)
# A tibble: 1 × 3
  x     y     z    
  <chr> <chr> <chr>
1 1     2     3    
read_csv(
  another_csv,
  col_types = cols_only(x = col_character())
)
# A tibble: 1 × 1
  x    
  <chr>
1 1    

练习

read_csv("a,b\n1,2,3\n4,5,6")
read_csv("a,b,c\n1,2\n1,2,3,4")
read_csv("a,b\n\"1")
read_csv("a,b\n1,2\na,b")
read_csv("a;b\n1;3")

练习

annoying <- tibble(
  `1` = 1:10,
  `2` = `1` * 2 + rnorm(length(`1`))
)

批量读取

sales_files <- c("../../data/01-sales.csv",
  "../../data/02-sales.csv",
  "../../data/03-sales.csv")
read_csv(sales_files, id = "file")
# A tibble: 19 × 6
   file                    month     year brand  item     n
   <chr>                   <chr>    <dbl> <dbl> <dbl> <dbl>
 1 ../../data/01-sales.csv January   2019     1  1234     3
 2 ../../data/01-sales.csv January   2019     1  8721     9
 3 ../../data/01-sales.csv January   2019     1  1822     2
 4 ../../data/01-sales.csv January   2019     2  3333     1
 5 ../../data/01-sales.csv January   2019     2  2156     9
 6 ../../data/01-sales.csv January   2019     2  3987     6
 7 ../../data/01-sales.csv January   2019     2  3827     6
 8 ../../data/02-sales.csv February  2019     1  1234     8
 9 ../../data/02-sales.csv February  2019     1  8721     2
10 ../../data/02-sales.csv February  2019     1  1822     3
11 ../../data/02-sales.csv February  2019     2  3333     1
12 ../../data/02-sales.csv February  2019     2  2156     3
13 ../../data/02-sales.csv February  2019     2  3987     6
14 ../../data/03-sales.csv March     2019     1  1234     3
15 ../../data/03-sales.csv March     2019     1  3627     1
16 ../../data/03-sales.csv March     2019     1  8820     3
17 ../../data/03-sales.csv March     2019     2  7253     1
18 ../../data/03-sales.csv March     2019     2  8766     3
19 ../../data/03-sales.csv March     2019     2  8288     6

读取Excel,建议用readxl

(surveydf <- readxl::read_xlsx("../../data/survey.xlsx"))
# A tibble: 6 × 2
  survey_id n_pets
      <dbl> <chr> 
1         1 0     
2         2 1     
3         3 N/A   
4         4 two   
5         5 2     
6         6 <NA>  

读取Excel

(airqualitydf <- readxl::read_xlsx("../../data/airquality.xlsx", sheet = 2))
# A tibble: 20,088 × 20
   datetime            site  `CO_mg/m3` `CO_24h_mg/m3` `NO2_μg/m3`
   <dttm>              <chr>      <dbl>          <dbl>       <dbl>
 1 2024-03-19 01:00:00 1001A        0.1            0.4           5
 2 2024-03-19 01:00:00 1003A        0.2            0.4           9
 3 2024-03-19 01:00:00 1004A        0.2            0.4           4
 4 2024-03-19 01:00:00 1005A        0.1            0.3           6
 5 2024-03-19 01:00:00 1006A        0.1            0.4           5
 6 2024-03-19 01:00:00 1007A        0.3            0.5           6
 7 2024-03-19 01:00:00 1008A        0.2            0.4           2
 8 2024-03-19 01:00:00 1009A        0.2            0.4           2
 9 2024-03-19 01:00:00 1010A        0.1            0.3           2
10 2024-03-19 01:00:00 1011A        0.2            0.4          12
# ℹ 20,078 more rows
# ℹ 15 more variables: `NO2_24h_μg/m3` <dbl>, `O3_μg/m3` <dbl>,
#   `O3_24h_μg/m3` <lgl>, `O3_8h_μg/m3` <lgl>, `O3_8h_24h_μg/m3` <lgl>,
#   `PM10_μg/m3` <dbl>, `PM10_24h_μg/m3` <dbl>, `PM2.5_μg/m3` <dbl>,
#   `PM2.5_24h_μg/m3` <dbl>, `SO2_μg/m3` <dbl>, `SO2_24h_μg/m3` <dbl>,
#   AQI <dbl>, PrimaryPollutant <chr>, Quality <chr>, Unheathful <chr>

批量读取

sales_files <- list.files("../../data",
  pattern = "sales\\.csv$", full.names = TRUE)
sales_files
[1] "../../data/01-sales.csv" "../../data/02-sales.csv"
[3] "../../data/03-sales.csv"

写入csv

students
# A tibble: 6 × 5
  student_id full_name        favourite_food     meal_plan             age
       <dbl> <chr>            <chr>              <fct>               <dbl>
1          1 Sunil Huffmann   Strawberry yoghurt Lunch only              4
2          2 Barclay Lynn     French fries       Lunch only              5
3          3 Jayendra Lyne    <NA>               Breakfast and lunch     7
4          4 Leon Rossini     Anchovies          Lunch only             NA
5          5 Chidiegwu Dunkel Pizza              Breakfast and lunch     5
6          6 Güvenç Attila    Ice cream          Lunch only              6
write_csv(students, "students-2.csv")
read_csv("students-2.csv")
# A tibble: 6 × 5
  student_id full_name        favourite_food     meal_plan             age
       <dbl> <chr>            <chr>              <chr>               <dbl>
1          1 Sunil Huffmann   Strawberry yoghurt Lunch only              4
2          2 Barclay Lynn     French fries       Lunch only              5
3          3 Jayendra Lyne    <NA>               Breakfast and lunch     7
4          4 Leon Rossini     Anchovies          Lunch only             NA
5          5 Chidiegwu Dunkel Pizza              Breakfast and lunch     5
6          6 Güvenç Attila    Ice cream          Lunch only              6

写入Excel

writexl::write_xlsx(students, "../../data/writexldemo.xlsx")

读取数据库,以MySQL为例

if (FALSE) {
  conn <- cctdb::get_dbconn("nationalairquality")
  DBI::dbListTables(conn)
}

读取数据库,以MySQL为例

if (FALSE) {
  conn <- cctdb::get_dbconn("nationalairquality")
  metadf <- tbl(conn, "metadf") |> 
    head(100) |>
    collect()
  DBI::dbDisconnect(conn)
  saveRDS(metadf, file = "../../data/metadfdemo.RDS")
}
metadf <- readRDS(file = "../../data/metadfdemo.RDS")
lang <- "cn"
metadf |>
  ggplot(aes(lon, lat)) +
geom_point(aes(fill = Area)) +
dwfun::theme_sci()

练习

metadf <- readxl::read_xlsx("../../data/airquality.xlsx")
dir.create("../../data/metacity2/")
metadf |>
  nest(sitedf = -site) |>
  mutate(flag = purrr::map2(site, sitedf,
    ~ writexl::write_xlsx(.y, paste0("../../data/metacity2/", .x, ".xlsx"))))

练习

  1. 从“../../data/sales.xlsx”读取第9到13行的数据
  2. 从“../../data/meta_city.xlsx”读取所有的数据,并保存至“../../data/meta_city_onetable1.xlsx”
  3. 从“../../data/metacity/”读取所有的数据,并保存至“../../data/meta_city_onetable2.xlsx”

欢迎讨论!

苏命|https://drwater.rcees.ac.cn; https://drwater.rcees.ac.cn/bcard; Slides