library(boot)
library(tidyverse)
library(survival)
library(survminer)
library(gt)
library(gtsummary)
1 习题1
数据文件:R自带数据mtcars
1.1 以mpg为被解释变量,建立若干个一元回归模型,解释变量为数据框mtcars中除了mpg之外的其余的变量。在同一张表格中报告这些一元回归模型的估计结果。
Characteristic | N | Beta | 95% CI1 | p-value |
---|---|---|---|---|
cyl | 32 | -2.9 | -3.5, -2.2 | <0.001 |
disp | 32 | -0.04 | -0.05, -0.03 | <0.001 |
hp | 32 | -0.07 | -0.09, -0.05 | <0.001 |
drat | 32 | 7.7 | 4.6, 11 | <0.001 |
wt | 32 | -5.3 | -6.5, -4.2 | <0.001 |
qsec | 32 | 1.4 | 0.27, 2.6 | 0.017 |
vs | 32 | 7.9 | 4.6, 11 | <0.001 |
am | 32 | 7.2 | 3.6, 11 | <0.001 |
gear | 32 | 3.9 | 1.3, 6.6 | 0.005 |
carb | 32 | -2.1 | -3.2, -0.89 | 0.001 |
1 CI = Confidence Interval |
1.2 以mpg为被解释变量, 选取适当的解释变量,做一元回归、二元回归、三元回归,在同一张表格中报告这3个模型的估计结果。
m1 <- mtcars %>% lm(mpg ~ wt, .) %>%
tbl_regression(
estimate_fun = ~ style_number(.x, digits = 3),
pvalue_fun = ~ style_pvalue(.x, digits = 3)
) %>%
add_significance_stars(hide_p = FALSE,
thresholds = c(0.01, 0.05, 0.10),) %>%
bold_p(t = 0.10) %>%
add_glance_table(
include = c(nobs, adj.r.squared, p.value)
)
m2 <- mtcars %>% lm(mpg ~ wt + disp, .) %>%
tbl_regression(
estimate_fun = ~ style_number(.x, digits = 3),
pvalue_fun = ~ style_pvalue(.x, digits = 3)
) %>%
add_significance_stars(hide_p = FALSE,
thresholds = c(0.01, 0.05, 0.10),) %>%
bold_p(t = 0.10) %>%
add_glance_table(
include = c(nobs, adj.r.squared, p.value)
)
m3 <- mtcars %>% lm(mpg ~ wt + disp + cyl, .) %>%
tbl_regression(
estimate_fun = ~ style_number(.x, digits = 3),
pvalue_fun = ~ style_pvalue(.x, digits = 3)
) %>%
add_significance_stars(hide_p = FALSE,
thresholds = c(0.01, 0.05, 0.10),) %>%
bold_p(t = 0.10) %>%
add_glance_table(
include = c(nobs, adj.r.squared, p.value)
)
tbl_merge(list(m1, m2, m3),
tab_spanner = c("**Model 1**", "**Model 2**", "**Model 3**")) %>%
bold_labels() %>%
italicize_levels()
Characteristic | Model 1 | Model 2 | Model 3 | ||||||
---|---|---|---|---|---|---|---|---|---|
Beta1 | SE2 | p-value | Beta1 | SE2 | p-value | Beta1 | SE2 | p-value | |
wt | -5.344*** | 0.559 | <0.001 | -3.351*** | 1.16 | 0.007 | -3.636*** | 1.04 | 0.002 |
No. Obs. | 32 | 32 | 32 | ||||||
Adjusted R² | 0.745 | 0.766 | 0.815 | ||||||
p-value | <0.001 | <0.001 | <0.001 | ||||||
disp | -0.018* | 0.009 | 0.064 | 0.007 | 0.012 | 0.533 | |||
cyl | -1.785*** | 0.607 | 0.007 | ||||||
1 *p<0.1; **p<0.05; ***p<0.01 | |||||||||
2 SE = Standard Error |
1.3 并用anova()函数对上述三个模型进行比较。
model1 <- mtcars %>% lm(mpg ~ wt, .)
model2 <- mtcars %>% lm(mpg ~ wt + disp, .)
model3 <- mtcars %>% lm(mpg ~ wt + disp + cyl, .)
anova(model1, model2)
Analysis of Variance Table
Model 1: mpg ~ wt
Model 2: mpg ~ wt + disp
Res.Df RSS Df Sum of Sq F Pr(>F)
1 30 278.32
2 29 246.68 1 31.639 3.7195 0.06362 .
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Analysis of Variance Table
Model 1: mpg ~ wt
Model 2: mpg ~ wt + disp + cyl
Res.Df RSS Df Sum of Sq F Pr(>F)
1 30 278.32
2 28 188.49 2 89.83 6.672 0.00427 **
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Analysis of Variance Table
Model 1: mpg ~ wt + disp
Model 2: mpg ~ wt + disp + cyl
Res.Df RSS Df Sum of Sq F Pr(>F)
1 29 246.68
2 28 188.49 1 58.19 8.644 0.006512 **
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
1.4 以mpg为被解释变量, 选取适当的解释变量,构建一个多元回归模型,该模型中需要有一个交互项,以表格形式报告该模型的估计结果,并在表中列出模型的校正的R平方,给显著的系数标记上星号。
mtcars %>% lm(mpg ~ wt + disp + cyl + disp:cyl, .) %>%
tbl_regression(
estimate_fun = ~ style_number(.x, digits = 3),
pvalue_fun = ~ style_pvalue(.x, digits = 3)
) %>%
add_significance_stars(hide_p = FALSE,
thresholds = c(0.01, 0.05, 0.10),) %>%
bold_p(t = 0.10) %>%
add_glance_table(
include = c(nobs, adj.r.squared, p.value)
)
Characteristic | Beta1 | SE2 | p-value |
---|---|---|---|
wt | -2.737** | 1.05 | 0.014 |
disp | -0.087* | 0.043 | 0.053 |
cyl | -3.005*** | 0.780 | <0.001 |
disp * cyl | 0.011** | 0.005 | 0.031 |
No. Obs. | 32 | ||
Adjusted R² | 0.839 | ||
p-value | <0.001 | ||
1 *p<0.1; **p<0.05; ***p<0.01 | |||
2 SE = Standard Error |
1.5 用图形呈现1.4中估计的回归模型解释变量的Y的影响效应、解释变量系数的置信区间,并诊断该模型是否满足线性模型的假定?
library(performance)
mtcars %>% lm(mpg ~ wt + disp + cyl + disp:cyl, .) %>%
performance::check_model()
2 习题2
数据文件:load(“lbw.rda”)
2.1 将数据集lbw中的变量名全部转换为小写字母,并将数据集lbw复制给对象df,将df中的列id, 列bwt删除.
# 导入数据文件lbw.rda,该文件放置在项目所在的同一路径下。
# 若数据文lbw.rda没有放在与项目同一路径,需要在load()中指定数据文件的路径。
load("lbw.rda")
names(lbw) <- tolower(names(lbw))
df <- lbw
names(df) <- tolower(names(lbw))
df <- df %>% subset(select = -c(id, bwt))
2.2 对df中的变量进行转换:将ptl中等于2或3的组别,都合并到ptl等于1的组别中。将ftv中等于3,4,5,6的组别,都合并到ftv = 2的组别中。将low中等于”BWT <= 2500g”的组别,都转换为1,其余组别转换为0。
df$ptl <- ifelse(df$ptl == 2 | df$ptl == 3, 1, df$ptl) %>%
as.factor()
df$ftv <- ifelse(df$ftv == 3 | df$ftv == 4 |
df$ftv == 5| df$ftv == 6,
2,
df$ftv) %>%
as.factor()
df$low <- ifelse(df$low == "BWT <= 2500g", 1, 0)
2.3 以table1的形式报告df中各变量的描述性统计。
Characteristic | N = 1891 |
---|---|
low | 59 (31%) |
age | 23 (19, 26) |
lwt | 121 (110, 140) |
race | |
white | 96 (51%) |
black | 26 (14%) |
other | 67 (35%) |
smoke | 74 (39%) |
ptl | |
0 | 159 (84%) |
1 | 30 (16%) |
ht | 12 (6.3%) |
ui | 28 (15%) |
ftv | |
0 | 100 (53%) |
1 | 47 (25%) |
2 | 42 (22%) |
1 n (%); Median (IQR) |
2.4 建立若干个一元logistic回归模型,以low为因变量,age,lwt,race,smoke,ptl,ht,ui,ftv分别为自变量,以表格的形式。报告这些一元logistic回归的估计结果。
df %>%
gtsummary::tbl_uvregression(
method = glm,
y = low,
method.args = list(family = binomial),
exponentiate = FALSE
)
Characteristic | N | log(OR)1 | 95% CI1 | p-value |
---|---|---|---|---|
age | 189 | -0.05 | -0.12, 0.01 | 0.10 |
lwt | 189 | -0.01 | -0.03, 0.00 | 0.023 |
race | 189 | |||
white | — | — | ||
black | 0.84 | -0.08, 1.8 | 0.068 | |
other | 0.64 | -0.04, 1.3 | 0.067 | |
smoke | 189 | |||
no | — | — | ||
yes | 0.70 | 0.08, 1.3 | 0.028 | |
ptl | 189 | |||
0 | — | — | ||
1 | 1.5 | 0.66, 2.3 | <0.001 | |
ht | 189 | |||
no | — | — | ||
yes | 1.2 | 0.03, 2.5 | 0.046 | |
ui | 189 | |||
no | — | — | ||
yes | 0.95 | 0.13, 1.8 | 0.023 | |
ftv | 189 | |||
0 | — | — | ||
1 | -0.61 | -1.4, 0.16 | 0.13 | |
2 | -0.34 | -1.2, 0.43 | 0.4 | |
1 OR = Odds Ratio, CI = Confidence Interval |
df %>%
gtsummary::tbl_uvregression(
method = glm,
y = low,
method.args = list(family = binomial),
exponentiate = TRUE
)
Characteristic | N | OR1 | 95% CI1 | p-value |
---|---|---|---|---|
age | 189 | 0.95 | 0.89, 1.01 | 0.10 |
lwt | 189 | 0.99 | 0.97, 1.00 | 0.023 |
race | 189 | |||
white | — | — | ||
black | 2.33 | 0.93, 5.77 | 0.068 | |
other | 1.89 | 0.96, 3.76 | 0.067 | |
smoke | 189 | |||
no | — | — | ||
yes | 2.02 | 1.08, 3.80 | 0.028 | |
ptl | 189 | |||
0 | — | — | ||
1 | 4.32 | 1.94, 9.95 | <0.001 | |
ht | 189 | |||
no | — | — | ||
yes | 3.37 | 1.03, 11.8 | 0.046 | |
ui | 189 | |||
no | — | — | ||
yes | 2.58 | 1.13, 5.88 | 0.023 | |
ftv | 189 | |||
0 | — | — | ||
1 | 0.54 | 0.24, 1.17 | 0.13 | |
2 | 0.71 | 0.32, 1.53 | 0.4 | |
1 OR = Odds Ratio, CI = Confidence Interval |
2.5 建立多元logistic回归模型,解释变量包括age, lwt, smoke, ui, ptl,ht, ftv, 以及race. 以表格的形式报告多元logistic回归的估计结果。
glm(low ~ age + lwt+ smoke + ui + ptl + ht + ftv + race,
df, family = "binomial") %>% tbl_regression(exponentiate = TRUE, #被解释变量odds ratio
estimate_fun = ~ style_number(.x, digits = 3),
pvalue_fun = ~ style_pvalue(.x, digits = 3))
Characteristic | OR1 | 95% CI1 | p-value |
---|---|---|---|
age | 0.963 | 0.891, 1.038 | 0.336 |
lwt | 0.984 | 0.970, 0.998 | 0.027 |
smoke | |||
no | — | — | |
yes | 2.129 | 0.930, 4.975 | 0.075 |
ui | |||
no | — | — | |
yes | 1.974 | 0.785, 4.914 | 0.143 |
ptl | |||
0 | — | — | |
1 | 3.833 | 1.517, 10.129 | 0.005 |
ht | |||
no | — | — | |
yes | 6.775 | 1.715, 30.447 | 0.008 |
ftv | |||
0 | — | — | |
1 | 0.646 | 0.244, 1.621 | 0.363 |
2 | 1.196 | 0.482, 2.920 | 0.695 |
race | |||
white | — | — | |
black | 3.295 | 1.154, 9.600 | 0.026 |
other | 2.097 | 0.855, 5.281 | 0.109 |
1 OR = Odds Ratio, CI = Confidence Interval |
2.6 建立两个logistic回归模型,模型2的解释变量包括age, lwt, smoke, ui, ptl,ht, ftv, 以及race, 模型1的解释变量包括age, lwt, smoke, ui, ptl, 比较这两个模型。
model1 <- glm(low ~ age + lwt+ smoke + ui + ptl + ht + ftv + race,
df, family = "binomial")
model2 <- glm(low ~ age + lwt+ smoke + ui + ptl,
df, family = "binomial")
anova(model2, model1, test = "Chisq")
Analysis of Deviance Table
Model 1: low ~ age + lwt + smoke + ui + ptl
Model 2: low ~ age + lwt + smoke + ui + ptl + ht + ftv + race
Resid. Df Resid. Dev Df Deviance Pr(>Chi)
1 183 210.72
2 178 195.48 5 15.244 0.00937 **
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
2.7 用图形呈现2.5中logistic多元回归模型解释变量系数的置信区间。
glm(low ~ age + lwt+ smoke + ui + ptl + ht + ftv + race,
df, family = "binomial") %>%
effects::allEffects() %>%
plot()
2.8 计算2.5中logistic多元回归模型在样本均值点的平均边际效应。
Call:
mfx::logitmfx(formula = low ~ age + lwt + smoke + ui + ptl +
ht + ftv + race, data = df)
Marginal Effects:
dF/dx Std. Err. z P>|z|
age -0.0073713 0.0076081 -0.9689 0.332612
lwt -0.0030988 0.0013817 -2.2428 0.024909 *
smokeyes 0.1542049 0.0875802 1.7607 0.078285 .
uiyes 0.1475577 0.1076834 1.3703 0.170595
ptl1 0.3045239 0.1136779 2.6788 0.007388 **
htyes 0.4428752 0.1519895 2.9139 0.003570 **
ftv1 -0.0818986 0.0849246 -0.9644 0.334861
ftv2 0.0362232 0.0941498 0.3847 0.700430
raceblack 0.2701769 0.1272684 2.1229 0.033763 *
raceother 0.1528024 0.0972130 1.5718 0.115990
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
dF/dx is for discrete change for the following variables:
[1] "smokeyes" "uiyes" "ptl1" "htyes" "ftv1" "ftv2"
[7] "raceblack" "raceother"