library(boot)
library(tidyverse)
library(survival)
library(survminer)
library(gt)
library(gtsummary)

1 习题1

数据文件:R自带数据mtcars

1.1 以mpg为被解释变量,建立若干个一元回归模型,解释变量为数据框mtcars中除了mpg之外的其余的变量。在同一张表格中报告这些一元回归模型的估计结果。

data(mtcars)
library(gtsummary)
library(gt)

mtcars %>% 
  tbl_uvregression(
    method = lm,
    y = mpg
  )
Characteristic N Beta 95% CI1 p-value
cyl 32 -2.9 -3.5, -2.2 <0.001
disp 32 -0.04 -0.05, -0.03 <0.001
hp 32 -0.07 -0.09, -0.05 <0.001
drat 32 7.7 4.6, 11 <0.001
wt 32 -5.3 -6.5, -4.2 <0.001
qsec 32 1.4 0.27, 2.6 0.017
vs 32 7.9 4.6, 11 <0.001
am 32 7.2 3.6, 11 <0.001
gear 32 3.9 1.3, 6.6 0.005
carb 32 -2.1 -3.2, -0.89 0.001
1 CI = Confidence Interval

1.2 以mpg为被解释变量, 选取适当的解释变量,做一元回归、二元回归、三元回归,在同一张表格中报告这3个模型的估计结果。

m1 <- mtcars %>% lm(mpg ~ wt, .) %>% 
  tbl_regression(
    estimate_fun = ~ style_number(.x, digits = 3),
    pvalue_fun = ~ style_pvalue(.x, digits = 3)
  ) %>% 
  add_significance_stars(hide_p = FALSE,
                         thresholds = c(0.01, 0.05, 0.10),) %>%
  bold_p(t = 0.10) %>%
  add_glance_table(
    include = c(nobs, adj.r.squared, p.value)
  )

m2 <- mtcars %>% lm(mpg ~ wt + disp, .) %>% 
  tbl_regression(
    estimate_fun = ~ style_number(.x, digits = 3),
    pvalue_fun = ~ style_pvalue(.x, digits = 3)
  ) %>% 
  add_significance_stars(hide_p = FALSE,
                         thresholds = c(0.01, 0.05, 0.10),) %>%
  bold_p(t = 0.10) %>%
  add_glance_table(
    include = c(nobs, adj.r.squared, p.value)
  )

m3 <- mtcars %>% lm(mpg ~ wt + disp + cyl, .) %>% 
  tbl_regression(
    estimate_fun = ~ style_number(.x, digits = 3),
    pvalue_fun = ~ style_pvalue(.x, digits = 3)
  ) %>% 
  add_significance_stars(hide_p = FALSE,
                         thresholds = c(0.01, 0.05, 0.10),) %>%
  bold_p(t = 0.10) %>%
  add_glance_table(
    include = c(nobs, adj.r.squared, p.value)
  )

tbl_merge(list(m1, m2, m3),
          tab_spanner = c("**Model 1**", "**Model 2**", "**Model 3**")) %>% 
  bold_labels() %>%
  italicize_levels()
Characteristic Model 1 Model 2 Model 3
Beta1 SE2 p-value Beta1 SE2 p-value Beta1 SE2 p-value
wt -5.344*** 0.559 <0.001 -3.351*** 1.16 0.007 -3.636*** 1.04 0.002
No. Obs. 32

32

32

Adjusted R² 0.745

0.766

0.815

p-value <0.001

<0.001

<0.001

disp


-0.018* 0.009 0.064 0.007 0.012 0.533
cyl





-1.785*** 0.607 0.007
1 *p<0.1; **p<0.05; ***p<0.01
2 SE = Standard Error

1.3 并用anova()函数对上述三个模型进行比较。

model1 <- mtcars %>% lm(mpg ~ wt, .)

model2 <- mtcars %>% lm(mpg ~ wt + disp, .)

model3 <- mtcars %>% lm(mpg ~ wt + disp + cyl, .)

anova(model1, model2)
Analysis of Variance Table

Model 1: mpg ~ wt
Model 2: mpg ~ wt + disp
  Res.Df    RSS Df Sum of Sq      F  Pr(>F)  
1     30 278.32                              
2     29 246.68  1    31.639 3.7195 0.06362 .
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
anova(model1, model3)
Analysis of Variance Table

Model 1: mpg ~ wt
Model 2: mpg ~ wt + disp + cyl
  Res.Df    RSS Df Sum of Sq     F  Pr(>F)   
1     30 278.32                              
2     28 188.49  2     89.83 6.672 0.00427 **
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
anova(model2, model3)
Analysis of Variance Table

Model 1: mpg ~ wt + disp
Model 2: mpg ~ wt + disp + cyl
  Res.Df    RSS Df Sum of Sq     F   Pr(>F)   
1     29 246.68                               
2     28 188.49  1     58.19 8.644 0.006512 **
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

1.4 以mpg为被解释变量, 选取适当的解释变量,构建一个多元回归模型,该模型中需要有一个交互项,以表格形式报告该模型的估计结果,并在表中列出模型的校正的R平方,给显著的系数标记上星号。

 mtcars %>% lm(mpg ~ wt + disp + cyl + disp:cyl, .) %>% 
  tbl_regression(
    estimate_fun = ~ style_number(.x, digits = 3),
    pvalue_fun = ~ style_pvalue(.x, digits = 3)
  ) %>% 
  add_significance_stars(hide_p = FALSE,
                         thresholds = c(0.01, 0.05, 0.10),) %>%
  bold_p(t = 0.10) %>%
  add_glance_table(
    include = c(nobs, adj.r.squared, p.value)
  )
Characteristic Beta1 SE2 p-value
wt -2.737** 1.05 0.014
disp -0.087* 0.043 0.053
cyl -3.005*** 0.780 <0.001
disp * cyl 0.011** 0.005 0.031
No. Obs. 32

Adjusted R² 0.839

p-value <0.001

1 *p<0.1; **p<0.05; ***p<0.01
2 SE = Standard Error

1.5 用图形呈现1.4中估计的回归模型解释变量的Y的影响效应、解释变量系数的置信区间,并诊断该模型是否满足线性模型的假定?

library(effects)   
mtcars %>% lm(mpg ~ wt + disp + cyl + disp:cyl, .) %>%
  allEffects() %>%
  plot()

library(performance)    
mtcars %>% lm(mpg ~ wt + disp + cyl + disp:cyl, .)  %>% 
  performance::check_model()

2 习题2

数据文件:load(“lbw.rda”)

数据文件下载

2.1 将数据集lbw中的变量名全部转换为小写字母,并将数据集lbw复制给对象df,将df中的列id, 列bwt删除.

# 导入数据文件lbw.rda,该文件放置在项目所在的同一路径下。
# 若数据文lbw.rda没有放在与项目同一路径,需要在load()中指定数据文件的路径。
load("lbw.rda") 
names(lbw) <- tolower(names(lbw))
df <- lbw
names(df) <- tolower(names(lbw))
df <- df %>% subset(select = -c(id, bwt))

2.2 对df中的变量进行转换:将ptl中等于2或3的组别,都合并到ptl等于1的组别中。将ftv中等于3,4,5,6的组别,都合并到ftv = 2的组别中。将low中等于”BWT <= 2500g”的组别,都转换为1,其余组别转换为0。

df$ptl <- ifelse(df$ptl == 2 | df$ptl == 3, 1, df$ptl) %>% 
  as.factor()

df$ftv <- ifelse(df$ftv == 3 | df$ftv == 4 | 
                   df$ftv == 5| df$ftv == 6, 
                 2, 
                 df$ftv) %>% 
    as.factor()

df$low <- ifelse(df$low == "BWT <= 2500g", 1, 0)

2.3 以table1的形式报告df中各变量的描述性统计。

df %>% tbl_summary()
Characteristic N = 1891
low 59 (31%)
age 23 (19, 26)
lwt 121 (110, 140)
race
    white 96 (51%)
    black 26 (14%)
    other 67 (35%)
smoke 74 (39%)
ptl
    0 159 (84%)
    1 30 (16%)
ht 12 (6.3%)
ui 28 (15%)
ftv
    0 100 (53%)
    1 47 (25%)
    2 42 (22%)
1 n (%); Median (IQR)

2.4 建立若干个一元logistic回归模型,以low为因变量,age,lwt,race,smoke,ptl,ht,ui,ftv分别为自变量,以表格的形式。报告这些一元logistic回归的估计结果。

df %>% 
  gtsummary::tbl_uvregression(
    method = glm,
    y = low,
    method.args = list(family = binomial),
    exponentiate = FALSE
  )
Characteristic N log(OR)1 95% CI1 p-value
age 189 -0.05 -0.12, 0.01 0.10
lwt 189 -0.01 -0.03, 0.00 0.023
race 189


    white

    black
0.84 -0.08, 1.8 0.068
    other
0.64 -0.04, 1.3 0.067
smoke 189


    no

    yes
0.70 0.08, 1.3 0.028
ptl 189


    0

    1
1.5 0.66, 2.3 <0.001
ht 189


    no

    yes
1.2 0.03, 2.5 0.046
ui 189


    no

    yes
0.95 0.13, 1.8 0.023
ftv 189


    0

    1
-0.61 -1.4, 0.16 0.13
    2
-0.34 -1.2, 0.43 0.4
1 OR = Odds Ratio, CI = Confidence Interval
df %>% 
  gtsummary::tbl_uvregression(
    method = glm,
    y = low,
    method.args = list(family = binomial),
    exponentiate = TRUE
  )
Characteristic N OR1 95% CI1 p-value
age 189 0.95 0.89, 1.01 0.10
lwt 189 0.99 0.97, 1.00 0.023
race 189


    white

    black
2.33 0.93, 5.77 0.068
    other
1.89 0.96, 3.76 0.067
smoke 189


    no

    yes
2.02 1.08, 3.80 0.028
ptl 189


    0

    1
4.32 1.94, 9.95 <0.001
ht 189


    no

    yes
3.37 1.03, 11.8 0.046
ui 189


    no

    yes
2.58 1.13, 5.88 0.023
ftv 189


    0

    1
0.54 0.24, 1.17 0.13
    2
0.71 0.32, 1.53 0.4
1 OR = Odds Ratio, CI = Confidence Interval

2.5 建立多元logistic回归模型,解释变量包括age, lwt, smoke, ui, ptl,ht, ftv, 以及race. 以表格的形式报告多元logistic回归的估计结果。

glm(low ~ age + lwt+ smoke + ui + ptl + ht + ftv + race,
                 df, family = "binomial") %>% tbl_regression(exponentiate = TRUE,  #被解释变量odds ratio
              estimate_fun = ~ style_number(.x, digits = 3),
              pvalue_fun = ~ style_pvalue(.x, digits = 3))
Characteristic OR1 95% CI1 p-value
age 0.963 0.891, 1.038 0.336
lwt 0.984 0.970, 0.998 0.027
smoke


    no
    yes 2.129 0.930, 4.975 0.075
ui


    no
    yes 1.974 0.785, 4.914 0.143
ptl


    0
    1 3.833 1.517, 10.129 0.005
ht


    no
    yes 6.775 1.715, 30.447 0.008
ftv


    0
    1 0.646 0.244, 1.621 0.363
    2 1.196 0.482, 2.920 0.695
race


    white
    black 3.295 1.154, 9.600 0.026
    other 2.097 0.855, 5.281 0.109
1 OR = Odds Ratio, CI = Confidence Interval

2.6 建立两个logistic回归模型,模型2的解释变量包括age, lwt, smoke, ui, ptl,ht, ftv, 以及race, 模型1的解释变量包括age, lwt, smoke, ui, ptl, 比较这两个模型。

model1 <- glm(low ~ age + lwt+ smoke + ui + ptl + ht + ftv + race,
                 df, family = "binomial")

model2 <- glm(low ~ age + lwt+ smoke + ui + ptl,
                 df, family = "binomial")

anova(model2, model1, test = "Chisq") 
Analysis of Deviance Table

Model 1: low ~ age + lwt + smoke + ui + ptl
Model 2: low ~ age + lwt + smoke + ui + ptl + ht + ftv + race
  Resid. Df Resid. Dev Df Deviance Pr(>Chi)   
1       183     210.72                        
2       178     195.48  5   15.244  0.00937 **
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

2.7 用图形呈现2.5中logistic多元回归模型解释变量系数的置信区间。

glm(low ~ age + lwt+ smoke + ui + ptl + ht + ftv + race,
                 df, family = "binomial") %>% 
  effects::allEffects() %>%
  plot()

2.8 计算2.5中logistic多元回归模型在样本均值点的平均边际效应。

mfx::logitmfx(low ~ age + lwt+ smoke + ui + ptl + ht + ftv + race,
                 df)
Call:
mfx::logitmfx(formula = low ~ age + lwt + smoke + ui + ptl + 
    ht + ftv + race, data = df)

Marginal Effects:
               dF/dx  Std. Err.       z    P>|z|   
age       -0.0073713  0.0076081 -0.9689 0.332612   
lwt       -0.0030988  0.0013817 -2.2428 0.024909 * 
smokeyes   0.1542049  0.0875802  1.7607 0.078285 . 
uiyes      0.1475577  0.1076834  1.3703 0.170595   
ptl1       0.3045239  0.1136779  2.6788 0.007388 **
htyes      0.4428752  0.1519895  2.9139 0.003570 **
ftv1      -0.0818986  0.0849246 -0.9644 0.334861   
ftv2       0.0362232  0.0941498  0.3847 0.700430   
raceblack  0.2701769  0.1272684  2.1229 0.033763 * 
raceother  0.1528024  0.0972130  1.5718 0.115990   
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

dF/dx is for discrete change for the following variables:

[1] "smokeyes"  "uiyes"     "ptl1"      "htyes"     "ftv1"      "ftv2"     
[7] "raceblack" "raceother"