增加第8次课课后作业示例代码

This commit is contained in:
Ming Su 2024-04-05 12:19:23 +08:00
parent 7c06e29cbe
commit 51973491d4
11 changed files with 489 additions and 74 deletions

Binary file not shown.

Binary file not shown.

View File

@ -4,6 +4,9 @@ subtitle: 《区域水环境污染数据分析实践》<br>Data analysis practic
author: 苏命、王为东<br>中国科学院大学资源与环境学院<br>中国科学院生态环境研究中心
date: today
lang: zh
resources:
- "*.pdf"
- "*.sas"
format:
revealjs:
theme: dark
@ -42,6 +45,19 @@ require(learnr)
作业模板:[第8次课后作业_模板.qmd](https://drwater.rcees.ac.cn/git/course/RWEP/raw/branch/main/SD/20240328_9_课后作业/第8次课后作业_模板.qmd)
## 示例代码
### 基于R的示例结果
- [第8次课后作业R示例代码结果](./第8次课后作业_模板.html)
### 基于SAS的示例结果
- [第8次课后作业SAS示例代码](./第8次课后作业_模板.sas)
- [第8次课后作业SAS示例结果1](./median.pdf)
- [第8次课后作业SAS示例结果2](./freq.pdf)
- [第8次课后作业SAS示例结果3](./airqualitymedianoutrow5.pdf)
- [第8次课后作业SAS示例结果4](./npar1wayConover.pdf)
## 欢迎讨论!{.center}

Binary file not shown.

Binary file not shown.

View File

@ -4,24 +4,34 @@ author: 姓名
format: html
---
# 下载airquality.xlsx并读取数据
# 数据
下载airquality.xlsx并读取数据。
```{r}
#| eval: false
#| execute: false
#| message: false
#| warning: false
# 下载至临时文件
tmpxlsxpath <- file.path(tempdir(), "airquality.xlsx")
download.file("https://drwater.rcees.ac.cn/git/course/RWEP/raw/branch/PUB/data/airquality.xlsx",
destfile = tmpxlsxpath)
airqualitydf <- readxl::read_xlsx(tmpxlsxpath, sheet = 2)
metadf <- readxl::read_xlsx(tmpxlsxpath, sheet = 1)
if (FALSE) {
tmpxlsxpath <- file.path(tempdir(), "airquality.xlsx")
download.file("https://drwater.rcees.ac.cn/git/course/RWEP/raw/branch/PUB/data/airquality.xlsx",
destfile = tmpxlsxpath)
airqualitydf <- readxl::read_xlsx(tmpxlsxpath, sheet = 2)
metadf <- readxl::read_xlsx(tmpxlsxpath, sheet = 1)
saveRDS(airqualitydf, "./airqualitydf.RDS")
saveRDS(metadf, "./metadf.RDS")
}
airqualitydf <- readRDS("./airqualitydf.RDS")
metadf <- readRDS("./metadf.RDS")
```
# 根据`airqualitydf.xlsx`按采样点统计白天8:00-20:00与夜晚20:00-8:00中空气质量指数AQI中位数按城市统计低于所有采样点AQI30%分位值的采样点占比列出上述占比最高的10个城市不考虑采样点数低于5个的城市
# 描述统计
根据`airqualitydf.xlsx`按采样点统计白天8:00-20:00与夜晚20:00-8:00中空气质量指数AQI中位数按城市统计低于所有采样点AQI30%分位值的采样点占比列出上述占比最高的10个城市不考虑采样点数低于5个的城市
```{r}
#| eval: false
#| execute: false
#| message: false
#| warning: false
require(tidyverse)
airqualitydf |>
select(datetime, site, AQI) |>
@ -49,70 +59,90 @@ airqualitydf |>
filter(length(unique(site)) >= 5) |>
summarize(p = sum(AQI < quantile(airqualitydf$AQI, probs = 0.2,
na.rm = TRUE)) / n()) |>
slice_max(p, n = 10)
```
# 按照不同城市分组统计白天与夜晚AQI中位数是否具有显著差异。
```{r}
#| eval: false
if (FALSE) {
require(infer)
testdf <- airqualitydf |>
select(datetime, site, AQI) |>
filter(!is.na(AQI)) |>
left_join(metadf |> select(site, city = Area)) |>
group_by(city) |>
filter(length(unique(site)) >= 5) |>
mutate(dayornight = factor(ifelse(between(hour(datetime), 8, 20), "day", "night"),
levels = c("day", "night"))
) |>
group_by(city) |>
nest(citydf = -city) |>
mutate(median_diff = purrr::map_dbl(citydf, ~
.x |>
specify(AQI ~ dayornight) |>
calculate(stat = "diff in medians", order = c("day", "night")) |>
pull(stat)
)) |>
ungroup() |>
# slice_sample(n = 12) |>
mutate(null_dist = purrr::map(citydf, ~
.x |>
specify(AQI ~ dayornight) |>
hypothesize(null = "independence") |>
generate(reps = 1000, type = "permute") |>
calculate(stat = "diff in medians", order = c("day", "night"))
)) |>
mutate(fig = purrr::pmap(list(null_dist, median_diff, city),
~ visualize(..1) +
shade_p_value(obs_stat = ..2, direction = "both") +
ggtitle(..3)
)) |>
mutate(p_value = purrr::map2_dbl(null_dist, median_diff,
~ get_p_value(.x, obs_stat = .y, direction = "both") |>
pull(p_value)
)) |>
arrange(p_value) |>
mutate(sigdiff = ifelse(p_value < 0.01, "显著差异", "无显著差异"))
testdf |>
select(city, sigdiff) |>
slice_max(p, n = 10) |>
knitr::kable()
lang <- "cn"
(testdf |>
slice_sample(n = 9) |>
pull(fig)) |>
patchwork::wrap_plots(ncol = 3) +
dwfun::theme_sci(5, 5)
dwfun::ggsavep("./testdf.pdf")
}
```
# 统计检验
按照不同城市分组统计白天与夜晚AQI中位数是否具有显著差异。
```{r}
#| message: false
#| warning: false
if (FALSE) {
require(infer)
require(tidyverse)
testdf <- airqualitydf |>
select(datetime, site, AQI) |>
filter(!is.na(AQI)) |>
left_join(metadf |> select(site, city = Area)) |>
group_by(city) |>
filter(length(unique(site)) >= 5) |>
mutate(dayornight = factor(ifelse(between(hour(datetime), 8, 20), "day", "night"),
levels = c("day", "night"))
) |>
group_by(city) |>
nest(citydf = -city) |>
mutate(median_diff = purrr::map_dbl(citydf, ~
.x |>
specify(AQI ~ dayornight) |>
calculate(stat = "diff in medians", order = c("day", "night")) |>
pull(stat)
)) |>
ungroup() |>
# slice_sample(n = 12) |>
mutate(null_dist = purrr::map(citydf, ~
.x |>
specify(AQI ~ dayornight) |>
hypothesize(null = "independence") |>
generate(reps = 1000, type = "permute") |>
calculate(stat = "diff in medians", order = c("day", "night"))
)) |>
mutate(p_value = purrr::map2_dbl(null_dist, median_diff,
~ get_p_value(.x, obs_stat = .y, direction = "both") |>
pull(p_value)
)) |>
mutate(sigdiff = ifelse(p_value < 0.01, "显著差异", "无显著差异")) |>
mutate(fig = purrr::pmap(list(null_dist, median_diff, city, sigdiff),
~ visualize(..1) +
shade_p_value(obs_stat = ..2, direction = "both") +
ggtitle(paste0(..3, "", ..4)) +
theme_sci(2, 2)
)) |>
arrange(p_value)
saveRDS(testdf, "./testdf.RDS")
}
lang <- "cn"
require(dwfun)
require(rmdify)
require(drwateR)
dwfun::init()
rmdify::rmd_init()
testdf <- readRDS("./testdf.RDS")
require(tidyverse)
testdf |>
select(city, median_diff, p_value, sigdiff) |>
knitr::kable()
testdf |>
mutate(grp = (row_number() - 1)%/% 12) |>
group_by(grp) |>
nest(grpdf = -grp) |>
ungroup() |>
# slice(1) |>
mutate(gp = purrr::map(grpdf,
~(.x |>
pull(fig)) |>
patchwork::wrap_plots(ncol = 3) +
dwfun::theme_sci(5, 7))) |>
pull(gp)
```

View File

@ -0,0 +1,289 @@
options ls=256 ps=32767 nodate validmemname=extend validvarname=any;
title 'The SAS System';
%macro print(d);
proc print data=&d;run;
%mend;
%macro printobs(d,obs);
proc print data=&d (obs=&obs);run;
%mend;
%macro printfirstobsobs(d,firstobs,obs);
proc print data=&d (firstobs=&firstobs obs=&obs);run;
%mend;
%macro contents(d);
proc contents data=&d varnum;run;
%mend;
%macro contentsshort(d);
proc contents data=&d varnum short;run;
%mend;
%macro save_dataset(d);
data "d:&d";
set &d;
run;
%mend;
%macro load_dataset(d);
data &d;
set "d:&d";
run;
%mend;
%macro kill;
PROC DATASETS LIB=work KILL;RUN;quit;
%mend;
proc template;
list styles;
run;
%kill;
PROC IMPORT OUT=WORK.raw_metadf
DATAFILE="d:airquality.xlsx"
DBMS=EXCEL REPLACE;
RANGE="metadf$";
GETNAMES=YES;
MIXED=YES;
SCANTEXT=YES;
USEDATE=NO;
SCANTIME=NO;
RUN;
%print(raw_metadf);
%save_dataset(raw_metadf); *原始数据集存盘;
PROC IMPORT OUT=WORK.raw_airqualitydf
DATAFILE="d:airquality.xlsx"
DBMS=EXCEL REPLACE;
RANGE="airqualitydf$";
GETNAMES=YES;
MIXED=YES;
SCANTEXT=YES;
USEDATE=NO; *为YES崩溃闪退;
SCANTIME=NO; *为YES崩溃闪退;
RUN;
%print(raw_airqualitydf);
%save_dataset(raw_airqualitydf); *原始数据集存盘;
%kill;
/*加载硬盘原始数据集*/
%load_dataset(raw_metadf);
%load_dataset(raw_airqualitydf);
/*查看数据集内容*/
%contents(raw_metadf);
%contentsshort(raw_metadf);
/*site name Area lon lat*/
%contents(raw_airqualitydf);
%contentsshort(raw_airqualitydf);
/*datetime site 'CO_mg/m3'n 'CO_24h_mg/m3'n 'NO2_μg/m3'n 'NO2_24h_μg/m3'n 'O3_μg/m3'n 'O3_24h_μg/m3'n 'O3_8h_μg/m3'n 'O3_8h_24h_μg/m3'n 'PM10_μg/m3'n 'PM10_24h_μg/m3'n 'PM2#5_μg/m3'n 'PM2#5_24h_μg/m3'n 'SO2_μg/m3'n 'SO2_24h_μg/m3'n AQI PrimaryPollutant Quality Unheathful*/
%printobs(raw_metadf,10);
%printobs(raw_airqualitydf,10);
proc sort data=raw_metadf out=metadfsorted;
by site;
run;
proc sort data=raw_airqualitydf out=airqualitydfsorted;
by site;
run;
/*合并数据集数据预处理提取日期、时间部分划分day、night*/
data airquality;
retain datetime date time DayNight site name Area AQI lon lat;
length DayNight $ 5;
merge metadfsorted airqualitydfsorted;
by site;
date=datepart(datetime);
time=timepart(datetime);
if '8:00't<=time<'20:00't then DayNight='day';
else DayNight='night';
format datetime e8601dt25. date yymmdd10. time time5.;
keep site name Area lon lat datetime date time DayNight AQI;
run;
%printobs(airquality,100);
%save_dataset(airquality); *合并数据集存盘;
%kill;
/*#################### DATA SET airquality ####################*/
/*加载硬盘合并数据集*/
%load_dataset(airquality);
%printobs(airquality,100);
/*检查site、name数量是否一致发现不一致后以site进行统计*/
proc sql;
select count(distinct(site)) as count_site from airquality;
select count(distinct(name)) as count_name from airquality;
quit;
/*
count_site
1714
count_name
1522
*/
/*#########################################################################*/
/*按采样点统计白天8:00-20:00与夜晚20:00-8:00中空气质量指数AQI中位数*/
/*#########################################################################*/
/*@@@@@@@@@@@@@@@@@@@@@ PDF @@@@@@@@@@@@@@@@@@@@@@*/
ods pdf file='d:means.pdf' style=sapphire dpi=1200;
proc means data=airquality median maxdec=1;
class site DayNight;
var AQI;
where AQI is not missing;
run;
ods pdf close;
/*@@@@@@@@@@@@@@@@@@@@@ PDF @@@@@@@@@@@@@@@@@@@@@@*/
/*####################################################################################################*/
/*按城市统计低于所有采样点AQI30%分位值的采样点占比列出上述占比最高的10个城市不考虑采样点数低于5个的城市*/
/*####################################################################################################*/
/*输出查看所有采样点AQI30%分位值为与SQL验证可略去*/
proc univariate data=airquality noprint;
var AQI;
output out=airqualitystats pctlpts=30 pctlpre=P;
run;
%print(airqualitystats); /*42*/
/*输出所有采样点AQI30%分位值到宏变量*/
proc sql;
select AQI into : xvalues separated by ',' from airquality;
select distinct(pctl(30, &xvalues)) into : P30 from airquality;
quit;
/*查看所有采样点AQI30%分位值的宏变量值,为后续调用*/
%put P30=&P30.;
/*按所有采样点AQI30%分位值对AQI分级对合并数据集所有采样点所有数据直接分级后续用各采样点中位数进行统计可略去*/
data airquality1;
set airquality;
if AQI<&P30. then quality='good';
else quality='fair';
run;
%printobs(airquality1,100);
/*输出所有采样点AQI中位数*/
proc means data=airquality median maxdec=1;
class Area site;
var AQI;
where AQI is not missing;
output out=airqualitymedian median=;
run;
%print(airqualitymedian);
/*按所有采样点AQI30%分位值对AQI中位数分级*/
data airqualitymedian1;
set airqualitymedian;
if AQI<&P30. then quality='good';
else quality='fair';
where _TYPE_=3;
run;
%print(airqualitymedian1);
/*按城市统计低于所有采样点AQI30%分位值的采样点占比,查看结果*/
/*@@@@@@@@@@@@@@@@@@@@@ PDF @@@@@@@@@@@@@@@@@@@@@*/
ods pdf file='d:freq.pdf' style=sapphire dpi=1200;
proc freq data=airqualitymedian1;
table Area*quality /nocol nopercent;
run;
ods pdf close;
/*@@@@@@@@@@@@@@@@@@@@@ PDF @@@@@@@@@@@@@@@@@@@@@*/
/*按城市统计低于所有采样点AQI30%分位值的采样点占比,输出频数统计结果到数据集*/
proc freq data=airqualitymedian1;
table Area*quality /outpct out=airqualitymedianoutrow(drop=percent pct_col); *保留行列频数与行百分比;
run;
%printobs(airqualitymedianoutrow,100);
/*按城市对采样点数进行统计,查看结果,可略去*/
proc means data=airqualitymedianoutrow sum maxdec=0;
class Area;
var COUNT;
run;
/*输出采样点数不低于5个的城市查看结果可略去*/
proc sql;
select *,sum(COUNT) as total_COUNT from airqualitymedianoutrow group by Area having calculated total_COUNT>=5 order by quality desc,PCT_ROW desc,COUNT desc;
quit;
/*将采样点数不低于5个的城市输出到数据集*/
proc sql;
create table airqualitymedianoutrow5 as select *,sum(COUNT) as total_COUNT from airqualitymedianoutrow group by Area having calculated total_COUNT>=5 order by quality desc,PCT_ROW desc,COUNT desc;
quit;
/*列出上述占比最高的10个城市不含采样点数低于5个的城市*/
/*@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ PDF @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@*/
ods pdf file='d:airqualitymedianoutrow5.pdf' style=sapphire dpi=1200;
%printobs(airqualitymedianoutrow5,10);
%printobs(airqualitymedianoutrow5,20);
%printobs(airqualitymedianoutrow5,30);
%print(airqualitymedianoutrow5);
ods pdf close;
/*@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ PDF @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@*/
/*#####################################################*/
/*按照不同城市分组统计白天与夜晚AQI中位数是否具有显著差异*/
/*#####################################################*/
/*发现有的site没有Area对应*/
proc sql;
select distinct(Area),count(distinct(Area)) as count_Area from airquality;
quit;
proc print data=airquality;
where Area is missing;
run;
proc sql;
select distinct(site),count(distinct(site)) as count_site from airquality where Area is missing;
quit;
/*有4个site采样点没有Area城市对应
site count_site
2628A 4
3128A 4
4034A 4
4036A 4
*/
proc sort data=airquality out=airqualitysorted;
by Area;
run;
/*按照不同城市分组统计白天与夜晚AQI中位数查看结果可略去*/
proc means data=airqualitysorted median maxdec=1;
by Area;
class DayNight;
var AQI;
where AQI is not missing;
run;
/*笼统地看白天与夜晚AQI中位数是否具有显著差异*/
proc npar1way data=airquality median;
class DayNight;
var AQI;
run;
/*按照不同城市分组统计白天与夜晚AQI中位数是否具有显著差异*/
/*@@@@@@@@@@@@@@@@@@@@@@@@@@ PDF @@@@@@@@@@@@@@@@@@@@@@@@@*/
ods pdf file='d:npar1waymedian.pdf' style=sapphire dpi=1200;
proc npar1way data=airqualitysorted median;
class DayNight;
var AQI;
by Area;
where Area is not missing;
run;
ods pdf close;
/*@@@@@@@@@@@@@@@@@@@@@@@@@@ PDF @@@@@@@@@@@@@@@@@@@@@@@@@*/
/*
Using Wilcoxon scores in the linear rank statistic for two-sample data produces the rank sum statistic of the Mann-Whitney-Wilcoxon test.
Using Wilcoxon scores in the one-way ANOVA statistic produces the Kruskal-Wallis test.
Wilcoxon scores are locally most powerful for location shifts of a logistic distribution.
*//*
Using median scores in the linear rank statistic for two-sample data produces the two-sample median test.
The one-way ANOVA statistic with median scores is equivalent to the Brown-Mood test.
Median scores are particularly powerful for distributions that are symmetric and heavy-tailed.*/
/*
Scores for Linear Rank and One-Way ANOVA Tests
For each score type that you specify, PROC NPAR1WAY computes a one-way ANOVA statistic and also a linear rank statistic for two-sample data. The following score types are used primarily to test for differences in location: Wilcoxon, median, Van der Waerden (normal), and Savage. The following scores types are used to test for scale differences: Siegel-Tukey, Ansari-Bradley, Klotz, and Mood. Conover scores can be used to test for differences in both location and scale. This section gives formulas for the score types available in PROC NPAR1WAY. For further information about the formulas and the applicability of each score, see Randles and Wolfe (1979), Gibbons and Chakraborti (2010), Conover (1999), and Hollander and Wolfe (1999).
In addition to the score types described in this section, you can specify the SCORES=DATA option to use the input data observations as scores. This enables you to produce a wide variety of tests. You can construct any scores by using the DATA step, and then you can use PROC NPAR1WAY to compute the corresponding linear rank and one-way ANOVA tests for these scores. You can also analyze raw (unscored) data by using the SCORES=DATA option; for two-sample data, the corresponding exact test is a permutation test that is known as Pitmans test.
*/
/*@@@@@@@@@@@@@@@@@@@@@@@@@@@ PDF @@@@@@@@@@@@@@@@@@@@@@@@@*/
ods pdf file='d:npar1wayConover.pdf' style=sapphire dpi=1200;
proc npar1way data=airqualitysorted Conover;
class DayNight;
var AQI;
by Area;
where Area is not missing;
run;
/*@@@@@@@@@@@@@@@@@@@@@@@@@@@ PDF @@@@@@@@@@@@@@@@@@@@@@@@@*/
/*Conover scores can be used to test for differences in both location and scale.*/

View File

@ -24,7 +24,7 @@ website:
page-navigation: true
page-footer: "Copyright 2024, [Ming Su](https://drwater.rcees.ac.cn)"
navbar:
background: "light"
background: "grey"
search: true
right:
- icon: house

80
coding/lesson9.qmd Normal file
View File

@ -0,0 +1,80 @@
---
title: 第9次课练习
---
```{r}
require(tidyverse)
# require(ggplot2)
require(palmerpenguins)
penguins |>
select(species, flipper_length_mm, body_mass_g,
bill_length_mm, sex) |>
filter(!is.na(sex)) |>
ggplot(aes(
x = species,
y = body_mass_g
)) +
geom_jitter(colour = "white",
fill = "gray80", size = 1, shape = 21, width = 0.4) +
geom_violin(fill = "orange", alpha = 0.4) +
stat_summary(fun = mean) +
scale_size_continuous(range = c(.5, 2)) +
# ggsci::scale_fill_d3() +
ylab("Body mass (g)") +
ggtitle("Body mass and flipper length") +
theme(axis.text.x = element_text(angle = 90),
panel.background = element_rect(fill = NA)
)
ggplot2::last_plot()
ggsave("../img/penguins.pdf", width = 4, heigh = 3)
+
dwfun::theme_sci(5, 5)
p1 <- ggplot(penguins, aes(x = island, fill = species)) +
geom_bar(position = "fill")
p2 <- ggplot(penguins, aes(x = species, fill = island)) +
geom_bar(position = "fill")
require(patchwork)
((p1 / p2) | p1)
```
# 消光系数
```{r}
licordata <- readRDS("../data/licordata.RDS")
require(tidyverse)
licordata |>
group_by(date, site) |>
nest(.key = "lightdf") |>
mutate(m = purrr::map(
lightdf,
~ (lm(log(UP + 1) ~ depth, data = .x))
)) |>
mutate(k = purrr::map_dbl(m, ~
-coef(.x)[2])) |>
ggplot(aes(date, k)) +
geom_point(aes(fill = site))
Iz = I0 * exp(-zk)
```

Binary file not shown.

BIN
img/penguins.pdf Normal file

Binary file not shown.