--- title: 课后作业8 author: 姓名 format: html --- # 数据 下载airquality.xlsx,并读取数据。 ```{r} #| message: false #| warning: false # 下载至临时文件 if (FALSE) { tmpxlsxpath <- file.path(tempdir(), "airquality.xlsx") download.file("https://drwater.rcees.ac.cn/git/course/RWEP/raw/branch/PUB/data/airquality.xlsx", destfile = tmpxlsxpath) airqualitydf <- readxl::read_xlsx(tmpxlsxpath, sheet = 2) metadf <- readxl::read_xlsx(tmpxlsxpath, sheet = 1) saveRDS(airqualitydf, "./airqualitydf.RDS") saveRDS(metadf, "./metadf.RDS") } airqualitydf <- readRDS("./airqualitydf.RDS") metadf <- readRDS("./metadf.RDS") ``` # 描述统计 根据`airqualitydf.xlsx`,按采样点统计白天(8:00-20:00)与夜晚(20:00-8:00)中空气质量指数(AQI)中位数,按城市统计低于所有采样点AQI30%分位值的采样点占比,列出上述占比最高的10个城市(不考虑采样点数低于5个的城市)。 ```{r} #| message: false #| warning: false require(tidyverse) airqualitydf |> select(datetime, site, AQI) |> filter(!is.na(AQI)) |> group_by(site) |> summarize(AQI.median = median(AQI, na.rm = TRUE)) |> left_join(metadf |> select(site, city = Area)) |> group_by(city) |> filter(n() > 5) |> summarize(p = sum(AQI.median < quantile(airqualitydf$AQI, probs = 0.5, na.rm = TRUE)) / n()) |> top_n(10, p) airqualitydf |> select(datetime, site, AQI) |> filter(!is.na(AQI)) |> group_by(site) |> summarize(AQI.median = median(AQI, na.rm = TRUE)) airqualitydf |> select(datetime, site, AQI) |> filter(!is.na(AQI)) |> left_join(metadf |> select(site, city = Area)) |> group_by(city) |> filter(length(unique(site)) >= 5) |> summarize(p = sum(AQI < quantile(airqualitydf$AQI, probs = 0.2, na.rm = TRUE)) / n()) |> slice_max(p, n = 10) |> knitr::kable() ``` # 统计检验 按照不同城市分组,统计白天与夜晚AQI中位数是否具有显著差异。 ```{r} #| message: false #| warning: false if (FALSE) { require(infer) require(tidyverse) testdf <- airqualitydf |> select(datetime, site, AQI) |> filter(!is.na(AQI)) |> left_join(metadf |> select(site, city = Area)) |> group_by(city) |> filter(length(unique(site)) >= 5) |> mutate(dayornight = factor(ifelse(between(hour(datetime), 8, 20), "day", "night"), levels = c("day", "night")) ) |> group_by(city) |> nest(citydf = -city) |> mutate(median_diff = purrr::map_dbl(citydf, ~ .x |> specify(AQI ~ dayornight) |> calculate(stat = "diff in medians", order = c("day", "night")) |> pull(stat) )) |> ungroup() |> # slice_sample(n = 12) |> mutate(null_dist = purrr::map(citydf, ~ .x |> specify(AQI ~ dayornight) |> hypothesize(null = "independence") |> generate(reps = 1000, type = "permute") |> calculate(stat = "diff in medians", order = c("day", "night")) )) |> mutate(p_value = purrr::map2_dbl(null_dist, median_diff, ~ get_p_value(.x, obs_stat = .y, direction = "both") |> pull(p_value) )) |> mutate(sigdiff = ifelse(p_value < 0.01, "显著差异", "无显著差异")) |> mutate(fig = purrr::pmap(list(null_dist, median_diff, city, sigdiff), ~ visualize(..1) + shade_p_value(obs_stat = ..2, direction = "both") + ggtitle(paste0(..3, ":", ..4)) + theme_sci(2, 2) )) |> arrange(p_value) saveRDS(testdf, "./testdf.RDS") } lang <- "cn" require(dwfun) require(rmdify) require(drwateR) dwfun::init() rmdify::rmd_init() testdf <- readRDS("./testdf.RDS") require(tidyverse) testdf |> select(city, median_diff, p_value, sigdiff) |> knitr::kable() testdf |> mutate(grp = (row_number() - 1)%/% 12) |> group_by(grp) |> nest(grpdf = -grp) |> ungroup() |> # slice(1) |> mutate(gp = purrr::map(grpdf, ~(.x |> pull(fig)) |> patchwork::wrap_plots(ncol = 3) + dwfun::theme_sci(5, 7))) |> pull(gp) ```