本章复习R中常用的统计分析工具。
数据文件:mpg。mpg是ggplot2包中自带的数据,调用前需要加载包ggplot2。
This dataset contains a subset of the fuel economy data that the EPA makes available on https://fueleconomy.gov/. It contains only models which had a new release every year between 1999 and 2008 - this was used as a proxy for the popularity of the car.
#加载ggplot2,tidyverse
library(ggplot2)
library(tidyverse)
# Load the dataset and preview its first rows
data(mpg)
head(mpg)
# A tibble: 6 × 11
  manufacturer model displ  year   cyl trans      drv     cty   hwy fl    class 
  <chr>        <chr> <dbl> <int> <int> <chr>      <chr> <int> <int> <chr> <chr> 
1 audi         a4      1.8  1999     4 auto(l5)   f        18    29 p     compa…
2 audi         a4      1.8  1999     4 manual(m5) f        21    29 p     compa…
3 audi         a4      2    2008     4 manual(m6) f        20    31 p     compa…
4 audi         a4      2    2008     4 auto(av)   f        21    30 p     compa…
5 audi         a4      2.8  1999     6 auto(l5)   f        16    26 p     compa…
6 audi         a4      2.8  1999     6 manual(m5) f        18    26 p     compa…
1 描述性统计分析——图形工具
1.1 定量变量
1.1.1 直方图 hist()
# Basic histogram of highway mileage
hist(mpg$hwy)

# Fix the axis ranges with xlim / ylim and set the bin boundaries with breaks
hist(
  mpg$hwy,
  ylim = c(0, 100),
  xlim = c(0, 50),
  breaks = seq(0, 50, 5),
  col = 5
)

# Add a plot title (main) and axis titles (xlab, ylab);
# las = 1 draws the axis tick labels horizontally
hist(
  mpg$hwy,
  ylim = c(0, 100),
  xlim = c(0, 50),
  breaks = seq(0, 50, 5),
  col = 2,
  main = "Histogram of Highway MPG",
  xlab = "Miles per Gallon",
  ylab = "Frequency",
  las = 1
)
1.1.2 直方图 geom_histogram()
# Load ggplot2
library(ggplot2)

# Histogram of city mileage with the default binning; the message printed
# below reports the bin count that was used
mpg %>%
  ggplot(aes(cty)) +
  geom_histogram(col = 1, fill = 5)
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Same histogram with an explicit bin width of 5
mpg %>%
  ggplot(aes(cty)) +
  geom_histogram(col = 1, fill = 5, binwidth = 5)

# Grouped histogram: facet_wrap() draws one panel per drv value and
# fill = drv gives each group its own fill colour
mpg %>%
  ggplot(aes(cty, fill = drv)) +
  geom_histogram() +
  facet_wrap(~drv, ncol = 1)
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Append a new variable, transmission, to mpg:
# the first four characters of trans
mpg <- mpg %>% mutate(transmission = substr(trans, 1, 4))
mpg %>%
  ggplot(aes(cty, fill = transmission)) +
  geom_histogram() +
  facet_wrap(~transmission, ncol = 1)
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
1.1.3 箱线图 boxplot()
# Box plot of highway mileage
boxplot(mpg$hwy, main = "Boxplot of Highway MPG", las = 1, col = 4)

# Grouped box plot: one box per drv value.
# horizontal = TRUE lays the boxes out horizontally (TRUE is spelled out
# because T is an ordinary variable that can be reassigned).
boxplot(
  mpg$hwy ~ mpg$drv,
  las = 1,
  col = "cyan",
  ylim = c(0, 50),
  main = "Boxplot of Highway MPG",
  ylab = "f = front-wheel drive, r = rear wheel drive, 4 = 4wd",
  xlab = "Miles per Gallon",
  horizontal = TRUE
)
1.1.4 分组箱线图 geom_boxplot()
# Horizontal box plot
mpg %>%
  ggplot(aes(cty)) +
  geom_boxplot(fill = 4)

# Vertical box plot: coord_flip() swaps the two axes
mpg %>%
  ggplot(aes(cty)) +
  geom_boxplot(fill = 6) +
  coord_flip()

# Grouped box plot: facet_wrap() draws one panel per drv value;
# ncol = 1 arranges the panels in a single column
mpg %>%
  ggplot(aes(cty, col = drv)) +
  geom_boxplot() +
  facet_wrap(~drv, ncol = 1)

# Map transmission to the x axis inside aes()
mpg %>%
  ggplot(aes(transmission, cty, col = transmission)) +
  geom_boxplot()
1.2 定性变量
1.2.1 条形图 geom_bar()
# Bar chart of the number of rows per manufacturer
mpg %>%
  ggplot(aes(manufacturer)) +
  geom_bar(col = 5, fill = 5)

# Bar chart of the number of rows per drive type
mpg %>%
  ggplot(aes(drv)) +
  geom_bar(col = 4, fill = 4)
1.2.2 分组条形图 ggplot2::geom_bar()
# Drive type counts, one panel per year
mpg %>%
  ggplot(aes(drv)) +
  geom_bar(col = 5, fill = 5) +
  facet_wrap(~year)

# Transmission counts, one panel per year
mpg %>%
  ggplot(aes(transmission)) +
  geom_bar(col = 4, fill = 4) +
  facet_wrap(~year)
1.2.3 分组堆栈条形图
# position = "fill" stacks the drv groups within each transmission bar;
# the y axis is labelled as a proportion below
mpg %>%
  ggplot(aes(transmission, fill = drv)) +
  geom_bar(position = "fill", alpha = 0.5) +
  theme_bw() +
  # Remove the major and minor grid lines
  theme(
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank()
  ) +
  labs(
    title = "Transmission and Type of Drive",
    x = "Transmission",
    y = "Proportion"
  )
1.3 两个定量变量
1.3.1 普通散点图 plot()
# Basic scatter plot of city mileage against engine displacement
plot(
  mpg$displ, mpg$cty,
  pch = 8,
  col = "blue",
  las = 1
)

# Same plot with a title, axis labels, custom text sizes and fonts, and fixed
# axis ranges. xaxt / yaxt = "n" suppress the default axes so that they can be
# redrawn with axis() below.
plot(
  mpg$displ, mpg$cty,
  pch = 8,
  col = "blue",
  las = 1,
  main = "City Miles per Gallon & Engine Displacement",
  xlab = "Engine Displacement in Litres",
  ylab = "City Miles per Gallon",
  cex.main = 1.5,
  cex.lab = 1.2,
  cex.axis = 1,
  font.main = 1,
  font.lab = 2,
  font.axis = 3,
  xlim = c(0, 8),
  ylim = c(0, 40),
  xaxt = "n",
  yaxt = "n"
)
# x axis with ticks at 0, 1, ..., 8; y axis with ticks at 0, 2, ..., 40
axis(1, at = seq(0, 8, 1), labels = seq(0, 8, 1))
axis(2, at = seq(0, 40, 2), labels = seq(0, 40, 2), las = 1)
1.3.2 矩阵散点图 plot()
# Scatter matrix of the four selected variables
mpg %>%
  select(displ, cyl, cty, hwy) %>%
  plot()
1.3.3 分组散点图 ggplot2::geom_point()
# Map point colour to drv
mpg %>%
  ggplot(aes(displ, hwy, color = drv)) +
  geom_point() +
  geom_smooth(method = lm)
`geom_smooth()` using formula 'y ~ x'

# Map point colour to transmission
mpg %>%
  ggplot(aes(displ, hwy, color = transmission)) +
  geom_point() +
  geom_smooth(method = lm)
`geom_smooth()` using formula 'y ~ x'
2 描述性统计分析——统计量的计算
2.1 单个统计量
mean(mpg$hwy)
[1] 23.44017
median(mpg$hwy)
[1] 24
sd(mpg$hwy)
[1] 5.954643
max(mpg$hwy)
[1] 44
min(mpg$hwy)
[1] 12
# Quantiles at 0%, 25%, 50%, 75% and 100%
quantile(mpg$hwy, probs = seq(0, 1, 0.25))
  0%  25%  50%  75% 100% 
  12   18   24   27   44 
IQR(mpg$hwy)
[1] 9
2.2 统计量的批量报告
2.2.1 summary()
summary {base} 定量变量:报告mean、five number summary。
summary(mpg)
 manufacturer          model               displ            year     
 Length:234         Length:234         Min.   :1.600   Min.   :1999  
 Class :character   Class :character   1st Qu.:2.400   1st Qu.:1999  
 Mode  :character   Mode  :character   Median :3.300   Median :2004  
                                       Mean   :3.472   Mean   :2004  
                                       3rd Qu.:4.600   3rd Qu.:2008  
                                       Max.   :7.000   Max.   :2008  
      cyl           trans               drv                 cty       
 Min.   :4.000   Length:234         Length:234         Min.   : 9.00  
 1st Qu.:4.000   Class :character   Class :character   1st Qu.:14.00  
 Median :6.000   Mode  :character   Mode  :character   Median :17.00  
 Mean   :5.889                                         Mean   :16.86  
 3rd Qu.:8.000                                         3rd Qu.:19.00  
 Max.   :8.000                                         Max.   :35.00  
      hwy             fl               class           transmission      
 Min.   :12.00   Length:234         Length:234         Length:234        
 1st Qu.:18.00   Class :character   Class :character   Class :character  
 Median :24.00   Mode  :character   Mode  :character   Mode  :character  
 Mean   :23.44                                                           
 3rd Qu.:27.00                                                           
 Max.   :44.00                                                           
2.2.2 psych::describe()
定量变量:统计量最全面,包括:item name; item number; number of valid cases; mean; standard deviation; trimmed mean (with trim defaulting to .1); median (standard or interpolated); mad: median absolute deviation (from the median); minimum; maximum; skew; kurtosis; standard error。定性变量:将其转换为数值代码,再计算,结果往往没有意义。
library(psych)
# Report the descriptive statistics for every column at once
describe(mpg)
              vars   n    mean    sd median trimmed   mad    min  max range
manufacturer*    1 234    7.76  5.13    6.0    7.68  5.93    1.0   15  14.0
model*           2 234   19.09 11.15   18.5   18.98 14.08    1.0   38  37.0
displ            3 234    3.47  1.29    3.3    3.39  1.33    1.6    7   5.4
year             4 234 2003.50  4.51 2003.5 2003.50  6.67 1999.0 2008   9.0
cyl              5 234    5.89  1.61    6.0    5.86  2.97    4.0    8   4.0
trans*           6 234    5.65  2.88    4.0    5.53  1.48    1.0   10   9.0
drv*             7 234    1.67  0.66    2.0    1.59  1.48    1.0    3   2.0
cty              8 234   16.86  4.26   17.0   16.61  4.45    9.0   35  26.0
hwy              9 234   23.44  5.95   24.0   23.23  7.41   12.0   44  32.0
fl*             10 234    4.63  0.70    5.0    4.77  0.00    1.0    5   4.0
class*          11 234    4.59  1.99    5.0    4.64  2.97    1.0    7   6.0
transmission*   12 234    1.33  0.47    1.0    1.29  0.00    1.0    2   1.0
               skew kurtosis   se
manufacturer*  0.21    -1.63 0.34
model*         0.11    -1.23 0.73
displ          0.44    -0.91 0.08
year           0.00    -2.01 0.29
cyl            0.11    -1.46 0.11
trans*         0.29    -1.65 0.19
drv*           0.48    -0.76 0.04
cty            0.79     1.43 0.28
hwy            0.36     0.14 0.39
fl*           -2.25     5.76 0.05
class*        -0.14    -1.52 0.13
transmission*  0.72    -1.48 0.03
# Descriptive statistics computed separately for each value of cyl
describeBy(mpg ~ cyl)
 Descriptive statistics by group 
cyl: 4
              vars  n    mean   sd median trimmed  mad    min    max range
manufacturer*    1 81    6.30 2.54      7    6.62 2.97    1.0    9.0   8.0
model*           2 81   10.12 5.03     10   10.12 5.93    1.0   19.0  18.0
displ            3 81    2.15 0.32      2    2.14 0.30    1.6    2.7   1.1
year             4 81 2003.00 4.50   1999 2002.88 0.00 1999.0 2008.0   9.0
cyl              5 81    4.00 0.00      4    4.00 0.00    4.0    4.0   0.0
trans*           6 81    5.84 2.55      7    5.91 2.97    1.0    9.0   8.0
drv*             7 81    1.72 0.45      2    1.77 0.00    1.0    2.0   1.0
cty              8 81   21.01 3.50     21   20.62 2.97   15.0   35.0  20.0
hwy              9 81   28.80 4.52     29   28.49 2.97   20.0   44.0  24.0
fl*             10 81    3.62 0.62      4    3.72 0.00    1.0    4.0   3.0
class*          11 81    2.86 1.94      2    2.71 1.48    1.0    6.0   5.0
transmission*   12 81    1.49 0.50      1    1.49 0.00    1.0    2.0   1.0
               skew kurtosis   se
manufacturer* -0.81    -0.51 0.28
model*         0.00    -1.02 0.56
displ          0.07    -1.23 0.04
year           0.22    -1.98 0.50
cyl             NaN      NaN 0.00
trans*        -0.27    -1.66 0.28
drv*          -0.94    -1.13 0.05
cty            1.46     3.29 0.39
hwy            0.95     2.16 0.50
fl*           -1.66     2.85 0.07
class*         0.40    -1.60 0.22
transmission*  0.02    -2.02 0.06
------------------------------------------------------------ 
cyl: 5
              vars n    mean   sd median trimmed  mad    min    max range  skew
manufacturer*    1 4    1.00 0.00    1.0    1.00 0.00    1.0    1.0     0   NaN
model*           2 4    1.50 0.58    1.5    1.50 0.74    1.0    2.0     1  0.00
displ            3 4    2.50 0.00    2.5    2.50 0.00    2.5    2.5     0   NaN
year             4 4 2008.00 0.00 2008.0 2008.00 0.00 2008.0 2008.0     0   NaN
cyl              5 4    5.00 0.00    5.0    5.00 0.00    5.0    5.0     0   NaN
trans*           6 4    1.50 0.58    1.5    1.50 0.74    1.0    2.0     1  0.00
drv*             7 4    1.00 0.00    1.0    1.00 0.00    1.0    1.0     0   NaN
cty              8 4   20.50 0.58   20.5   20.50 0.74   20.0   21.0     1  0.00
hwy              9 4   28.75 0.50   29.0   28.75 0.00   28.0   29.0     1 -0.75
fl*             10 4    1.00 0.00    1.0    1.00 0.00    1.0    1.0     0   NaN
class*          11 4    1.50 0.58    1.5    1.50 0.74    1.0    2.0     1  0.00
transmission*   12 4    1.50 0.58    1.5    1.50 0.74    1.0    2.0     1  0.00
              kurtosis   se
manufacturer*      NaN 0.00
model*           -2.44 0.29
displ              NaN 0.00
year               NaN 0.00
cyl                NaN 0.00
trans*           -2.44 0.29
drv*               NaN 0.00
cty              -2.44 0.29
hwy              -1.69 0.25
fl*                NaN 0.00
class*           -2.44 0.29
transmission*    -2.44 0.29
------------------------------------------------------------ 
cyl: 6
              vars  n    mean   sd median trimmed  mad    min    max range
manufacturer*    1 79    5.80 3.36    5.0    5.77 4.45    1.0   11.0  10.0
model*           2 79   12.75 7.31   12.0   12.66 8.90    1.0   25.0  24.0
displ            3 79    3.41 0.47    3.4    3.42 0.59    2.5    4.2   1.7
year             4 79 2002.87 4.48 1999.0 2002.74 0.00 1999.0 2008.0   9.0
cyl              5 79    6.00 0.00    6.0    6.00 0.00    6.0    6.0   0.0
trans*           6 79    4.03 2.33    3.0    3.89 1.48    1.0    8.0   7.0
drv*             7 79    1.65 0.58    2.0    1.62 0.00    1.0    3.0   2.0
cty              8 79   16.22 1.77   16.0   16.28 1.48   11.0   19.0   8.0
hwy              9 79   22.82 3.69   24.0   22.92 2.97   17.0   29.0  12.0
fl*             10 79    3.72 0.55    4.0    3.82 0.00    1.0    4.0   3.0
class*          11 79    3.29 1.79    3.0    3.25 1.48    1.0    6.0   5.0
transmission*   12 79    1.29 0.46    1.0    1.25 0.00    1.0    2.0   1.0
               skew kurtosis   se
manufacturer*  0.16    -1.46 0.38
model*         0.14    -1.26 0.82
displ         -0.09    -1.30 0.05
year           0.28    -1.95 0.50
cyl             NaN      NaN 0.00
trans*         0.49    -1.49 0.26
drv*           0.21    -0.77 0.07
cty           -0.38    -0.50 0.20
hwy           -0.41    -1.30 0.41
fl*           -2.27     6.27 0.06
class*         0.33    -1.35 0.20
transmission*  0.90    -1.20 0.05
------------------------------------------------------------ 
cyl: 8
              vars  n    mean   sd median trimmed  mad  min  max range  skew
manufacturer*    1 70    4.14 2.32    3.0    3.73 1.48    1   11    10  1.48
model*           2 70   10.57 5.54   10.0   10.57 7.41    1   19    18  0.09
displ            3 70    5.13 0.59    5.2    5.09 0.74    4    7     3  0.64
year             4 70 2004.53 4.41 2008.0 2004.79 0.00 1999 2008     9 -0.46
cyl              5 70    8.00 0.00    8.0    8.00 0.00    8    8     0   NaN
trans*           6 70    2.94 2.53    2.0    2.55 1.48    1    8     7  1.02
drv*             7 70    1.61 0.92    1.0    1.52 0.00    1    3     2  0.82
cty              8 70   12.57 1.81   13.0   12.57 2.22    9   16     7 -0.03
hwy              9 70   17.63 3.26   17.0   17.39 2.97   12   26    14  0.72
fl*             10 70    3.57 0.73    4.0    3.73 0.00    1    4     3 -1.55
class*          11 70    3.99 1.27    5.0    4.20 0.00    1    5     4 -0.90
transmission*   12 70    1.17 0.38    1.0    1.09 0.00    1    2     1  1.71
              kurtosis   se
manufacturer*     1.65 0.28
model*           -1.42 0.66
displ             0.11 0.07
year             -1.81 0.53
cyl                NaN 0.00
trans*           -0.62 0.30
drv*             -1.33 0.11
cty              -0.63 0.22
hwy               0.37 0.39
fl*               1.41 0.09
class*           -0.31 0.15
transmission*     0.93 0.05
2.2.3 Hmisc::describe()
优点:报告定性变量的分布
library(Hmisc)
# Call describe() through its namespace: psych, loaded earlier in this file,
# exports a function with the same name
Hmisc::describe(mpg)
mpg 
 12  Variables      234  Observations
--------------------------------------------------------------------------------
manufacturer 
       n  missing distinct 
     234        0       15 
lowest : audi       chevrolet  dodge      ford       honda     
highest: nissan     pontiac    subaru     toyota     volkswagen
audi (18, 0.077), chevrolet (19, 0.081), dodge (37, 0.158), ford (25, 0.107),
honda (9, 0.038), hyundai (14, 0.060), jeep (8, 0.034), land rover (4, 0.017),
lincoln (3, 0.013), mercury (4, 0.017), nissan (13, 0.056), pontiac (5, 0.021),
subaru (14, 0.060), toyota (34, 0.145), volkswagen (27, 0.115)
--------------------------------------------------------------------------------
model 
       n  missing distinct 
     234        0       38 
lowest : 4runner 4wd         a4                  a4 quattro          a6 quattro          altima             
highest: ram 1500 pickup 4wd range rover         sonata              tiburon             toyota tacoma 4wd  
--------------------------------------------------------------------------------
displ 
       n  missing distinct     Info     Mean      Gmd      .05      .10 
     234        0       35    0.997    3.472    1.471      1.8      2.0 
     .25      .50      .75      .90      .95 
     2.4      3.3      4.6      5.4      5.7 
lowest : 1.6 1.8 1.9 2.0 2.2, highest: 6.0 6.1 6.2 6.5 7.0
--------------------------------------------------------------------------------
year 
       n  missing distinct     Info     Mean      Gmd 
     234        0        2     0.75     2004    4.519 
                    
Value      1999 2008
Frequency   117  117
Proportion  0.5  0.5
--------------------------------------------------------------------------------
cyl 
       n  missing distinct     Info     Mean      Gmd 
     234        0        4    0.893    5.889    1.761 
                                  
Value          4     5     6     8
Frequency     81     4    79    70
Proportion 0.346 0.017 0.338 0.299
--------------------------------------------------------------------------------
trans 
       n  missing distinct 
     234        0       10 
lowest : auto(av)   auto(l3)   auto(l4)   auto(l5)   auto(l6)  
highest: auto(s4)   auto(s5)   auto(s6)   manual(m5) manual(m6)
                                                                            
Value        auto(av)   auto(l3)   auto(l4)   auto(l5)   auto(l6)   auto(s4)
Frequency           5          2         83         39          6          3
Proportion      0.021      0.009      0.355      0.167      0.026      0.013
                                                      
Value        auto(s5)   auto(s6) manual(m5) manual(m6)
Frequency           3         16         58         19
Proportion      0.013      0.068      0.248      0.081
--------------------------------------------------------------------------------
drv 
       n  missing distinct 
     234        0        3 
                            
Value          4     f     r
Frequency    103   106    25
Proportion 0.440 0.453 0.107
--------------------------------------------------------------------------------
cty 
       n  missing distinct     Info     Mean      Gmd      .05      .10 
     234        0       21    0.993    16.86    4.686       11       11 
     .25      .50      .75      .90      .95 
      14       17       19       21       24 
lowest :  9 11 12 13 14, highest: 26 28 29 33 35
--------------------------------------------------------------------------------
hwy 
       n  missing distinct     Info     Mean      Gmd      .05      .10 
     234        0       27    0.993    23.44    6.668     15.0     16.3 
     .25      .50      .75      .90      .95 
    18.0     24.0     27.0     30.0     32.0 
lowest : 12 14 15 16 17, highest: 35 36 37 41 44
--------------------------------------------------------------------------------
fl 
       n  missing distinct 
     234        0        5 
lowest : c d e p r, highest: c d e p r
                                        
Value          c     d     e     p     r
Frequency      1     5     8    52   168
Proportion 0.004 0.021 0.034 0.222 0.718
--------------------------------------------------------------------------------
class 
       n  missing distinct 
     234        0        7 
lowest : 2seater    compact    midsize    minivan    pickup    
highest: midsize    minivan    pickup     subcompact suv       
                                                                            
Value         2seater    compact    midsize    minivan     pickup subcompact
Frequency           5         47         41         11         33         35
Proportion      0.021      0.201      0.175      0.047      0.141      0.150
                     
Value             suv
Frequency          62
Proportion      0.265
--------------------------------------------------------------------------------
transmission 
       n  missing distinct 
     234        0        2 
                      
Value       auto  manu
Frequency    157    77
Proportion 0.671 0.329
--------------------------------------------------------------------------------
2.2.4 pastecs::stat.desc()
优点:报告了置信区间、离散系数
library(pastecs)
stat.desc(mpg)
         manufacturer model      displ         year          cyl trans drv
nbr.val            NA    NA 234.000000 2.340000e+02  234.0000000    NA  NA
nbr.null           NA    NA   0.000000 0.000000e+00    0.0000000    NA  NA
nbr.na             NA    NA   0.000000 0.000000e+00    0.0000000    NA  NA
min                NA    NA   1.600000 1.999000e+03    4.0000000    NA  NA
max                NA    NA   7.000000 2.008000e+03    8.0000000    NA  NA
range              NA    NA   5.400000 9.000000e+00    4.0000000    NA  NA
sum                NA    NA 812.400000 4.688190e+05 1378.0000000    NA  NA
median             NA    NA   3.300000 2.003500e+03    6.0000000    NA  NA
mean               NA    NA   3.471795 2.003500e+03    5.8888889    NA  NA
SE.mean            NA    NA   0.084458 2.948048e-01    0.1053493    NA  NA
CI.mean            NA    NA   0.166399 5.808237e-01    0.2075589    NA  NA
var                NA    NA   1.669158 2.033691e+01    2.5970434    NA  NA
std.dev            NA    NA   1.291959 4.509646e+00    1.6115345    NA  NA
coef.var           NA    NA   0.372130 2.250884e-03    0.2736568    NA  NA
                  cty          hwy fl class transmission
nbr.val   234.0000000  234.0000000 NA    NA           NA
nbr.null    0.0000000    0.0000000 NA    NA           NA
nbr.na      0.0000000    0.0000000 NA    NA           NA
min         9.0000000   12.0000000 NA    NA           NA
max        35.0000000   44.0000000 NA    NA           NA
range      26.0000000   32.0000000 NA    NA           NA
sum      3945.0000000 5485.0000000 NA    NA           NA
median     17.0000000   24.0000000 NA    NA           NA
mean       16.8589744   23.4401709 NA    NA           NA
SE.mean     0.2782199    0.3892672 NA    NA           NA
CI.mean     0.5481481    0.7669333 NA    NA           NA
var        18.1130736   35.4577785 NA    NA           NA
std.dev     4.2559457    5.9546434 NA    NA           NA
coef.var    0.2524439    0.2540358 NA    NA           NA
the number of values (nbr.val); the number of null values (nbr.null); the number of missing values (nbr.na); the minimal value (min); the maximal value (max); the range (range, that is, max-min); the sum of all non-missing values (sum); the median (median); the mean (mean); the standard error on the mean (SE.mean); the confidence interval of the mean (CI.mean) at the p level; the variance (var); the standard deviation (std.dev); the variation coefficient (coef.var), defined as the standard deviation divided by the mean.
2.2.5 频数分布表 tbl_summary()
# Load gtsummary
library(gtsummary)
tbl_summary(mpg)
| Characteristic | N = 234¹ | 
|---|---|
| manufacturer | |
| audi | 18 (7.7%) | 
| chevrolet | 19 (8.1%) | 
| dodge | 37 (16%) | 
| ford | 25 (11%) | 
| honda | 9 (3.8%) | 
| hyundai | 14 (6.0%) | 
| jeep | 8 (3.4%) | 
| land rover | 4 (1.7%) | 
| lincoln | 3 (1.3%) | 
| mercury | 4 (1.7%) | 
| nissan | 13 (5.6%) | 
| pontiac | 5 (2.1%) | 
| subaru | 14 (6.0%) | 
| toyota | 34 (15%) | 
| volkswagen | 27 (12%) | 
| model | |
| 4runner 4wd | 6 (2.6%) | 
| a4 | 7 (3.0%) | 
| a4 quattro | 8 (3.4%) | 
| a6 quattro | 3 (1.3%) | 
| altima | 6 (2.6%) | 
| c1500 suburban 2wd | 5 (2.1%) | 
| camry | 7 (3.0%) | 
| camry solara | 7 (3.0%) | 
| caravan 2wd | 11 (4.7%) | 
| civic | 9 (3.8%) | 
| corolla | 5 (2.1%) | 
| corvette | 5 (2.1%) | 
| dakota pickup 4wd | 9 (3.8%) | 
| durango 4wd | 7 (3.0%) | 
| expedition 2wd | 3 (1.3%) | 
| explorer 4wd | 6 (2.6%) | 
| f150 pickup 4wd | 7 (3.0%) | 
| forester awd | 6 (2.6%) | 
| grand cherokee 4wd | 8 (3.4%) | 
| grand prix | 5 (2.1%) | 
| gti | 5 (2.1%) | 
| impreza awd | 8 (3.4%) | 
| jetta | 9 (3.8%) | 
| k1500 tahoe 4wd | 4 (1.7%) | 
| land cruiser wagon 4wd | 2 (0.9%) | 
| malibu | 5 (2.1%) | 
| maxima | 3 (1.3%) | 
| mountaineer 4wd | 4 (1.7%) | 
| mustang | 9 (3.8%) | 
| navigator 2wd | 3 (1.3%) | 
| new beetle | 6 (2.6%) | 
| passat | 7 (3.0%) | 
| pathfinder 4wd | 4 (1.7%) | 
| ram 1500 pickup 4wd | 10 (4.3%) | 
| range rover | 4 (1.7%) | 
| sonata | 7 (3.0%) | 
| tiburon | 7 (3.0%) | 
| toyota tacoma 4wd | 7 (3.0%) | 
| displ | 3.30 (2.40, 4.60) | 
| year | |
| 1999 | 117 (50%) | 
| 2008 | 117 (50%) | 
| cyl | |
| 4 | 81 (35%) | 
| 5 | 4 (1.7%) | 
| 6 | 79 (34%) | 
| 8 | 70 (30%) | 
| trans | |
| auto(av) | 5 (2.1%) | 
| auto(l3) | 2 (0.9%) | 
| auto(l4) | 83 (35%) | 
| auto(l5) | 39 (17%) | 
| auto(l6) | 6 (2.6%) | 
| auto(s4) | 3 (1.3%) | 
| auto(s5) | 3 (1.3%) | 
| auto(s6) | 16 (6.8%) | 
| manual(m5) | 58 (25%) | 
| manual(m6) | 19 (8.1%) | 
| drv | |
| 4 | 103 (44%) | 
| f | 106 (45%) | 
| r | 25 (11%) | 
| cty | 17 (14, 19) | 
| hwy | 24 (18, 27) | 
| fl | |
| c | 1 (0.4%) | 
| d | 5 (2.1%) | 
| e | 8 (3.4%) | 
| p | 52 (22%) | 
| r | 168 (72%) | 
| class | |
| 2seater | 5 (2.1%) | 
| compact | 47 (20%) | 
| midsize | 41 (18%) | 
| minivan | 11 (4.7%) | 
| pickup | 33 (14%) | 
| subcompact | 35 (15%) | 
| suv | 62 (26%) | 
| transmission | |
| auto | 157 (67%) | 
| manu | 77 (33%) | 
| ¹ n (%); Median (IQR) | |
2.2.6 学术论文表格 kableExtra::kbl()
library(kableExtra)

# Keep only the three quantitative variables used in the table
data <- mpg %>%
  select(displ, hwy, cty)
#data %>% 
#  kable(col.names = c("displacement","highway mpg","city mpg")) %>% 
#  scroll_box(width = "500px", height = "200px")

# One row per variable, one column per statistic.
# vapply() requires every statistic to be a single numeric value per column,
# so the result type does not depend on the input the way sapply()'s does.
table1 <- data.frame(
  Mean = colMeans(data),
  SD = vapply(data, sd, numeric(1)),
  Median = vapply(data, median, numeric(1)),
  Min = vapply(data, min, numeric(1)),
  Max = vapply(data, max, numeric(1))
)
table1
           Mean       SD Median  Min Max
displ  3.471795 1.291959    3.3  1.6   7
hwy   23.440171 5.954643   24.0 12.0  44
cty   16.858974 4.255946   17.0  9.0  35
# kbl() sets the table format:
#   col.names renames the columns,
#   digits = 3 sets the number of decimal places,
#   align = "c" centres the cells
table1 %>%
  kbl(
    col.names = c("Mean", "SD", "Median", "Minimum", "Maximum"),
    digits = 3,
    align = "c"
  ) %>%
  kable_styling()
| | Mean | SD | Median | Minimum | Maximum |
|---|---|---|---|---|---|
| displ | 3.472 | 1.292 | 3.3 | 1.6 | 7 | 
| hwy | 23.440 | 5.955 | 24.0 | 12.0 | 44 | 
| cty | 16.859 | 4.256 | 17.0 | 9.0 | 35 | 
table1
           Mean       SD Median  Min Max
displ  3.471795 1.291959    3.3  1.6   7
hwy   23.440171 5.954643   24.0 12.0  44
cty   16.858974 4.255946   17.0  9.0  35
# The "." placeholder stands for the piped-in object.
# Special pipe usage: wrapping the right-hand side in {} stops %>% from
# passing data as the first argument, so "." can be placed wherever needed.
# vapply() requires every statistic to be a single numeric value per column.
table2 <- data %>% {
  data.frame(
    Mean = colMeans(.),
    SD = vapply(., sd, numeric(1)),
    Median = vapply(., median, numeric(1)),
    Min = vapply(., min, numeric(1)),
    Max = vapply(., max, numeric(1))
  )
}
table2
           Mean       SD Median  Min Max
displ  3.471795 1.291959    3.3  1.6   7
hwy   23.440171 5.954643   24.0 12.0  44
cty   16.858974 4.255946   17.0  9.0  35
3 相关分析
3.1 相关系数 cor()
# Correlation coefficient between two variables
cor(mpg$displ, mpg$cty)
[1] -0.798524
# Correlation matrix of three variables, rounded to 3 decimal places
mpg %>%
  select(displ, cty, hwy) %>%
  cor() %>%
  round(digits = 3)
       displ    cty    hwy
displ  1.000 -0.799 -0.766
cty   -0.799  1.000  0.956
hwy   -0.766  0.956  1.000
3.2 相关系数显著性检验
3.2.1 cor.test()
# Test whether the correlation between displ and hwy differs from zero
cor.test(mpg$displ, mpg$hwy)
    Pearson's product-moment correlation
data:  mpg$displ and mpg$hwy
t = -18.151, df = 232, p-value < 2.2e-16
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
 -0.8142727 -0.7072539
sample estimates:
     cor 
-0.76602 
# Test whether the correlation between displ and cty differs from zero
cor.test(mpg$displ, mpg$cty)
    Pearson's product-moment correlation
data:  mpg$displ and mpg$cty
t = -20.205, df = 232, p-value < 2.2e-16
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
 -0.8406782 -0.7467508
sample estimates:
      cor 
-0.798524 
3.2.2 Hmisc::rcorr()
library(Hmisc)
# Convert the data to a matrix and pass it to rcorr(); printing the result
# shows the correlations, n, and the P values
correl.matrix <- data %>%
  as.matrix() %>%
  rcorr()
correl.matrix
      displ   hwy   cty
displ  1.00 -0.77 -0.80
hwy   -0.77  1.00  0.96
cty   -0.80  0.96  1.00
n= 234 
P
      displ hwy cty
displ        0   0 
hwy    0         0 
cty    0     0     
# List the components stored in the rcorr result
attributes(correl.matrix)
$names
[1] "r" "n" "P"
$class
[1] "rcorr"
# Correlation coefficients, rounded to 3 decimal places
round(correl.matrix$r, digits = 3)
       displ    hwy    cty
displ  1.000 -0.766 -0.799
hwy   -0.766  1.000  0.956
cty   -0.799  0.956  1.000
# P values, rounded to 3 decimal places
round(correl.matrix$P, digits = 3)
      displ hwy cty
displ    NA   0   0
hwy       0  NA   0
cty       0   0  NA
3.3 相关系数可视化
3.3.1 corrplot::corrplot()
library(corrplot)

# Correlation plot with the default settings
data %>%
  cor() %>%
  corrplot()

# Squares, upper triangle only
data %>%
  cor() %>%
  corrplot(
    method = "square",
    type = "upper"
  )

# Numeric coefficients, lower triangle only
data %>%
  cor() %>%
  corrplot(
    method = "number",
    type = "lower"
  )
3.3.2 ggstatsplot::ggscatterstats
library(ggstatsplot)
library(tidyverse)

# Scatter plot of hwy against displ drawn by ggscatterstats()
data %>%
  ggscatterstats(
    x = displ,
    y = hwy,
    bf.message = FALSE
  )
4 本章习题
数据文件:mtcars
1.查看mtcars的帮助文件,简要解释每个变量的含义。
2.用图形展示mpg的分布。(提示:直方图、箱线图)
3.将汽车按vs分组,用图形比较两个组别的汽车mpg的分布。(提示:分组直方图、分组箱线图)
4.将汽车按am分组,用图形比较两个组别的汽车mpg的分布。(提示:分组直方图、分组箱线图)
5.将汽车按vs分组,分别绘制两个组别wt的箱线图。
6.用图形展示vs的分布。(提示:条形图)
7.用图形展示am的分布。(提示:条形图)
8.用图形展示vs、am的分布。(提示:分组条形图、分组堆栈条形图)
9.用图形展示mpg和wt的关系。(提示:散点图)
10.用图形展示mpg和disp的关系。(提示:散点图)
11.用图形展示mpg,disp,hp两两之间的关系。(提示:矩阵散点图)
12.将汽车按am分成两组,绘制mpg和disp的分组散点图。
13.将汽车按vs分成两组,绘制mpg和wt的分组散点图。
14.报告定量变量的均值、中位数、标准差、最大值和最小值。
15.报告mpg, disp, hp, wt, drat五个变量两两之间的相关系数,并用可视化工具呈现其相关系数。(提示:corrplot::corrplot())
 
答题要求:
标好题号,将代码和输出图形/结果复制到word文档,再将word文档转换成图片,在91速课平台提交。