本章复习R中常用的统计分析工具。
数据文件:mpg。mpg是ggplot2包中自带的数据集,调用前需要先加载ggplot2包。
This dataset contains a subset of the fuel economy data that the EPA makes available on https://fueleconomy.gov/. It contains only models which had a new release every year between 1999 and 2008 - this was used as a proxy for the popularity of the car.
#加载ggplot2,tidyverse
library(ggplot2)
library(tidyverse)
#预览数据
data(mpg)
head(mpg)# A tibble: 6 × 11
manufacturer model displ year cyl trans drv cty hwy fl class
<chr> <chr> <dbl> <int> <int> <chr> <chr> <int> <int> <chr> <chr>
1 audi a4 1.8 1999 4 auto(l5) f 18 29 p compa…
2 audi a4 1.8 1999 4 manual(m5) f 21 29 p compa…
3 audi a4 2 2008 4 manual(m6) f 20 31 p compa…
4 audi a4 2 2008 4 auto(av) f 21 30 p compa…
5 audi a4 2.8 1999 6 auto(l5) f 16 26 p compa…
6 audi a4 2.8 1999 6 manual(m5) f 18 26 p compa…
1 描述性统计分析——图形工具
1.1 定量变量
1.1.1 直方图 hist()
# Default histogram of highway mileage.
hist(mpg$hwy)

# Fix the axis ranges with xlim/ylim and set the bin edges with breaks.
hist(
  mpg$hwy,
  breaks = seq(0, 50, 5),
  xlim = c(0, 50),
  ylim = c(0, 100),
  col = 5
)

# Add a plot title (main) and axis titles (xlab, ylab);
# las = 1 draws the tick labels horizontally.
hist(
  mpg$hwy,
  breaks = seq(0, 50, 5),
  xlim = c(0, 50),
  ylim = c(0, 100),
  col = 2,
  main = "Histogram of Highway MPG",
  xlab = "Miles per Gallon",
  ylab = "Frequency",
  las = 1
)
1.1.2 直方图 geom_histogram()
# Load ggplot2.
library(ggplot2)

# Histogram of city mileage with the default binning.
ggplot(mpg, aes(x = cty)) +
  geom_histogram(colour = 1, fill = 5)
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Same histogram with an explicit bin width of 5.
ggplot(mpg, aes(x = cty)) +
  geom_histogram(colour = 1, fill = 5, binwidth = 5)

# Grouped histogram: one facet per drv value, stacked in a single column,
# with each group filled in its own colour.
ggplot(mpg, aes(x = cty, fill = drv)) +
  geom_histogram() +
  facet_wrap(~drv, ncol = 1)
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Append a new variable, transmission, holding the first four characters of
# trans (so the values are "auto" and "manu").
mpg <- mutate(mpg, transmission = substr(trans, 1, 4))

# Histogram of city mileage, one facet per transmission type.
ggplot(mpg, aes(x = cty, fill = transmission)) +
  geom_histogram() +
  facet_wrap(~transmission, ncol = 1)
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
1.1.3 箱线图 boxplot()
# Box plot of highway mileage for all cars.
boxplot(mpg$hwy, main = "Boxplot of Highway MPG", las = 1, col = 4)

# Grouped box plot: one box per drv value.
# horizontal = TRUE lays the boxes out horizontally, so the mileage values run
# along the x axis and the drive types along the y axis. TRUE is spelled out
# because the shorthand T is an ordinary variable that can be reassigned.
boxplot(mpg$hwy ~ mpg$drv,
  las = 1, col = "cyan",
  ylim = c(0, 50),
  main = "Boxplot of Highway MPG",
  ylab = "f = front-wheel drive, r = rear wheel drive, 4 = 4wd",
  xlab = "Miles per Gallon",
  horizontal = TRUE
)
1.1.4 分组箱线图 geom_boxplot()
# Horizontal box plot of city mileage.
ggplot(mpg, aes(x = cty)) +
  geom_boxplot(fill = 4)

# Vertical box plot: the same plot with the coordinates flipped.
ggplot(mpg, aes(x = cty)) +
  geom_boxplot(fill = 6) +
  coord_flip()

# Grouped box plot: one facet per drv value; ncol = 1 stacks the panels in a
# single column.
ggplot(mpg, aes(x = cty, colour = drv)) +
  geom_boxplot() +
  facet_wrap(~drv, ncol = 1)

# Grouped box plot without facets: transmission is mapped to the x axis
# inside aes().
ggplot(mpg, aes(x = transmission, y = cty, colour = transmission)) +
  geom_boxplot()
1.2 定性变量
1.2.1 条形图 geom_bar()
# Bar chart of the row counts per manufacturer.
ggplot(mpg, aes(x = manufacturer)) +
  geom_bar(colour = 5, fill = 5)

# Bar chart of the row counts per drive type.
ggplot(mpg, aes(x = drv)) +
  geom_bar(colour = 4, fill = 4)
1.2.2 分组条形图 ggplot2::geom_bar()
# Drive-type counts, one panel per model year.
ggplot(mpg, aes(x = drv)) +
  geom_bar(colour = 5, fill = 5) +
  facet_wrap(~year)

# Transmission-type counts, one panel per model year.
ggplot(mpg, aes(x = transmission)) +
  geom_bar(colour = 4, fill = 4) +
  facet_wrap(~year)
1.2.3 分组堆栈条形图
# position = "fill" stacks the drv groups inside each transmission bar and
# rescales every bar to the same height, so the y axis shows proportions.
ggplot(mpg, aes(x = transmission, fill = drv)) +
  geom_bar(position = "fill", alpha = 0.5) +
  labs(
    title = "Transmission and Type of Drive",
    x = "Transmission",
    y = "Proportion"
  ) +
  theme_bw() +
  # Remove the major and minor grid lines from the black-and-white theme.
  theme(
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank()
  )
1.3 两个定量变量
1.3.1 普通散点图 plot()
# Basic scatter plot of city mileage against engine displacement.
plot(mpg$displ, mpg$cty,
  pch = 8,
  col = "blue",
  las = 1
)

# Customised scatter plot: title and axis labels, text sizes (cex.*), font
# faces (font.*) and fixed axis ranges. xaxt = "n" and yaxt = "n" stop plot()
# from drawing its own axes so that they can be drawn by hand below.
plot(mpg$displ, mpg$cty,
  pch = 8,
  col = "blue",
  las = 1,
  main = "City Miles per Gallon & Engine Displacement",
  xlab = "Engine Displacement in Litres",
  ylab = "City Miles per Gallon",
  cex.main = 1.5,
  cex.lab = 1.2,
  font.main = 1,
  font.lab = 2,
  xlim = c(0, 8),
  ylim = c(0, 40),
  xaxt = "n",
  yaxt = "n"
)
# The tick-label size and font must be given to axis() itself: settings passed
# to plot() only apply to the axes plot() draws, and those are suppressed here.
axis(1,
  at = seq(0, 8, 1), labels = seq(0, 8, 1),
  cex.axis = 1, font.axis = 3
)
axis(2,
  at = seq(0, 40, 2), labels = seq(0, 40, 2), las = 1,
  cex.axis = 1, font.axis = 3
)
1.3.2 矩阵散点图 plot()
# Scatter-plot matrix of the four numeric variables.
plot(select(mpg, displ, cyl, cty, hwy))
1.3.3 分组散点图 ggplot2::geom_point()
# Map drv to the point colour; a linear fit is added for each colour group.
ggplot(mpg, aes(x = displ, y = hwy, colour = drv)) +
  geom_point() +
  geom_smooth(method = "lm")
`geom_smooth()` using formula 'y ~ x'

# Map transmission (the variable derived from trans) to the point colour.
ggplot(mpg, aes(x = displ, y = hwy, colour = transmission)) +
  geom_point() +
  geom_smooth(method = "lm")
`geom_smooth()` using formula 'y ~ x'
2 描述性统计分析——统计量的计算
2.1 单个统计量
mean(mpg$hwy)[1] 23.44017
median(mpg$hwy)[1] 24
sd(mpg$hwy)[1] 5.954643
max(mpg$hwy)[1] 44
min(mpg$hwy)[1] 12
quantile(mpg$hwy,probs = seq(0,1,0.25)) 0% 25% 50% 75% 100%
12 18 24 27 44
IQR(mpg$hwy)[1] 9
2.2 统计量的批量报告
2.2.1 summary()
summary {base} 定量变量:报告mean、five number summary。
summary(mpg) manufacturer model displ year
Length:234 Length:234 Min. :1.600 Min. :1999
Class :character Class :character 1st Qu.:2.400 1st Qu.:1999
Mode :character Mode :character Median :3.300 Median :2004
Mean :3.472 Mean :2004
3rd Qu.:4.600 3rd Qu.:2008
Max. :7.000 Max. :2008
cyl trans drv cty
Min. :4.000 Length:234 Length:234 Min. : 9.00
1st Qu.:4.000 Class :character Class :character 1st Qu.:14.00
Median :6.000 Mode :character Mode :character Median :17.00
Mean :5.889 Mean :16.86
3rd Qu.:8.000 3rd Qu.:19.00
Max. :8.000 Max. :35.00
hwy fl class transmission
Min. :12.00 Length:234 Length:234 Length:234
1st Qu.:18.00 Class :character Class :character Class :character
Median :24.00 Mode :character Mode :character Mode :character
Mean :23.44
3rd Qu.:27.00
Max. :44.00
2.2.2 psych::describe()
定量变量:报告的统计量最全面,包括:item name、item number、number of valid cases、mean、standard deviation、trimmed mean (with trim defaulting to .1)、median (standard or interpolated)、mad: median absolute deviation (from the median)、minimum、maximum、skew、kurtosis、standard error。定性变量:describe()会先将其转换为数值代码再计算(输出中变量名带*标记),结果往往没有意义。
library(psych)#批量报告统计量
describe(mpg) vars n mean sd median trimmed mad min max range
manufacturer* 1 234 7.76 5.13 6.0 7.68 5.93 1.0 15 14.0
model* 2 234 19.09 11.15 18.5 18.98 14.08 1.0 38 37.0
displ 3 234 3.47 1.29 3.3 3.39 1.33 1.6 7 5.4
year 4 234 2003.50 4.51 2003.5 2003.50 6.67 1999.0 2008 9.0
cyl 5 234 5.89 1.61 6.0 5.86 2.97 4.0 8 4.0
trans* 6 234 5.65 2.88 4.0 5.53 1.48 1.0 10 9.0
drv* 7 234 1.67 0.66 2.0 1.59 1.48 1.0 3 2.0
cty 8 234 16.86 4.26 17.0 16.61 4.45 9.0 35 26.0
hwy 9 234 23.44 5.95 24.0 23.23 7.41 12.0 44 32.0
fl* 10 234 4.63 0.70 5.0 4.77 0.00 1.0 5 4.0
class* 11 234 4.59 1.99 5.0 4.64 2.97 1.0 7 6.0
transmission* 12 234 1.33 0.47 1.0 1.29 0.00 1.0 2 1.0
skew kurtosis se
manufacturer* 0.21 -1.63 0.34
model* 0.11 -1.23 0.73
displ 0.44 -0.91 0.08
year 0.00 -2.01 0.29
cyl 0.11 -1.46 0.11
trans* 0.29 -1.65 0.19
drv* 0.48 -0.76 0.04
cty 0.79 1.43 0.28
hwy 0.36 0.14 0.39
fl* -2.25 5.76 0.05
class* -0.14 -1.52 0.13
transmission* 0.72 -1.48 0.03
#分组统计量
describeBy(mpg ~ cyl)
Descriptive statistics by group
cyl: 4
vars n mean sd median trimmed mad min max range
manufacturer* 1 81 6.30 2.54 7 6.62 2.97 1.0 9.0 8.0
model* 2 81 10.12 5.03 10 10.12 5.93 1.0 19.0 18.0
displ 3 81 2.15 0.32 2 2.14 0.30 1.6 2.7 1.1
year 4 81 2003.00 4.50 1999 2002.88 0.00 1999.0 2008.0 9.0
cyl 5 81 4.00 0.00 4 4.00 0.00 4.0 4.0 0.0
trans* 6 81 5.84 2.55 7 5.91 2.97 1.0 9.0 8.0
drv* 7 81 1.72 0.45 2 1.77 0.00 1.0 2.0 1.0
cty 8 81 21.01 3.50 21 20.62 2.97 15.0 35.0 20.0
hwy 9 81 28.80 4.52 29 28.49 2.97 20.0 44.0 24.0
fl* 10 81 3.62 0.62 4 3.72 0.00 1.0 4.0 3.0
class* 11 81 2.86 1.94 2 2.71 1.48 1.0 6.0 5.0
transmission* 12 81 1.49 0.50 1 1.49 0.00 1.0 2.0 1.0
skew kurtosis se
manufacturer* -0.81 -0.51 0.28
model* 0.00 -1.02 0.56
displ 0.07 -1.23 0.04
year 0.22 -1.98 0.50
cyl NaN NaN 0.00
trans* -0.27 -1.66 0.28
drv* -0.94 -1.13 0.05
cty 1.46 3.29 0.39
hwy 0.95 2.16 0.50
fl* -1.66 2.85 0.07
class* 0.40 -1.60 0.22
transmission* 0.02 -2.02 0.06
------------------------------------------------------------
cyl: 5
vars n mean sd median trimmed mad min max range skew
manufacturer* 1 4 1.00 0.00 1.0 1.00 0.00 1.0 1.0 0 NaN
model* 2 4 1.50 0.58 1.5 1.50 0.74 1.0 2.0 1 0.00
displ 3 4 2.50 0.00 2.5 2.50 0.00 2.5 2.5 0 NaN
year 4 4 2008.00 0.00 2008.0 2008.00 0.00 2008.0 2008.0 0 NaN
cyl 5 4 5.00 0.00 5.0 5.00 0.00 5.0 5.0 0 NaN
trans* 6 4 1.50 0.58 1.5 1.50 0.74 1.0 2.0 1 0.00
drv* 7 4 1.00 0.00 1.0 1.00 0.00 1.0 1.0 0 NaN
cty 8 4 20.50 0.58 20.5 20.50 0.74 20.0 21.0 1 0.00
hwy 9 4 28.75 0.50 29.0 28.75 0.00 28.0 29.0 1 -0.75
fl* 10 4 1.00 0.00 1.0 1.00 0.00 1.0 1.0 0 NaN
class* 11 4 1.50 0.58 1.5 1.50 0.74 1.0 2.0 1 0.00
transmission* 12 4 1.50 0.58 1.5 1.50 0.74 1.0 2.0 1 0.00
kurtosis se
manufacturer* NaN 0.00
model* -2.44 0.29
displ NaN 0.00
year NaN 0.00
cyl NaN 0.00
trans* -2.44 0.29
drv* NaN 0.00
cty -2.44 0.29
hwy -1.69 0.25
fl* NaN 0.00
class* -2.44 0.29
transmission* -2.44 0.29
------------------------------------------------------------
cyl: 6
vars n mean sd median trimmed mad min max range
manufacturer* 1 79 5.80 3.36 5.0 5.77 4.45 1.0 11.0 10.0
model* 2 79 12.75 7.31 12.0 12.66 8.90 1.0 25.0 24.0
displ 3 79 3.41 0.47 3.4 3.42 0.59 2.5 4.2 1.7
year 4 79 2002.87 4.48 1999.0 2002.74 0.00 1999.0 2008.0 9.0
cyl 5 79 6.00 0.00 6.0 6.00 0.00 6.0 6.0 0.0
trans* 6 79 4.03 2.33 3.0 3.89 1.48 1.0 8.0 7.0
drv* 7 79 1.65 0.58 2.0 1.62 0.00 1.0 3.0 2.0
cty 8 79 16.22 1.77 16.0 16.28 1.48 11.0 19.0 8.0
hwy 9 79 22.82 3.69 24.0 22.92 2.97 17.0 29.0 12.0
fl* 10 79 3.72 0.55 4.0 3.82 0.00 1.0 4.0 3.0
class* 11 79 3.29 1.79 3.0 3.25 1.48 1.0 6.0 5.0
transmission* 12 79 1.29 0.46 1.0 1.25 0.00 1.0 2.0 1.0
skew kurtosis se
manufacturer* 0.16 -1.46 0.38
model* 0.14 -1.26 0.82
displ -0.09 -1.30 0.05
year 0.28 -1.95 0.50
cyl NaN NaN 0.00
trans* 0.49 -1.49 0.26
drv* 0.21 -0.77 0.07
cty -0.38 -0.50 0.20
hwy -0.41 -1.30 0.41
fl* -2.27 6.27 0.06
class* 0.33 -1.35 0.20
transmission* 0.90 -1.20 0.05
------------------------------------------------------------
cyl: 8
vars n mean sd median trimmed mad min max range skew
manufacturer* 1 70 4.14 2.32 3.0 3.73 1.48 1 11 10 1.48
model* 2 70 10.57 5.54 10.0 10.57 7.41 1 19 18 0.09
displ 3 70 5.13 0.59 5.2 5.09 0.74 4 7 3 0.64
year 4 70 2004.53 4.41 2008.0 2004.79 0.00 1999 2008 9 -0.46
cyl 5 70 8.00 0.00 8.0 8.00 0.00 8 8 0 NaN
trans* 6 70 2.94 2.53 2.0 2.55 1.48 1 8 7 1.02
drv* 7 70 1.61 0.92 1.0 1.52 0.00 1 3 2 0.82
cty 8 70 12.57 1.81 13.0 12.57 2.22 9 16 7 -0.03
hwy 9 70 17.63 3.26 17.0 17.39 2.97 12 26 14 0.72
fl* 10 70 3.57 0.73 4.0 3.73 0.00 1 4 3 -1.55
class* 11 70 3.99 1.27 5.0 4.20 0.00 1 5 4 -0.90
transmission* 12 70 1.17 0.38 1.0 1.09 0.00 1 2 1 1.71
kurtosis se
manufacturer* 1.65 0.28
model* -1.42 0.66
displ 0.11 0.07
year -1.81 0.53
cyl NaN 0.00
trans* -0.62 0.30
drv* -1.33 0.11
cty -0.63 0.22
hwy 0.37 0.39
fl* 1.41 0.09
class* -0.31 0.15
transmission* 0.93 0.05
2.2.3 Hmisc::describe()
优点:报告定性变量的分布
library(Hmisc)describe(mpg)mpg
12 Variables 234 Observations
--------------------------------------------------------------------------------
manufacturer
n missing distinct
234 0 15
lowest : audi chevrolet dodge ford honda
highest: nissan pontiac subaru toyota volkswagen
audi (18, 0.077), chevrolet (19, 0.081), dodge (37, 0.158), ford (25, 0.107),
honda (9, 0.038), hyundai (14, 0.060), jeep (8, 0.034), land rover (4, 0.017),
lincoln (3, 0.013), mercury (4, 0.017), nissan (13, 0.056), pontiac (5, 0.021),
subaru (14, 0.060), toyota (34, 0.145), volkswagen (27, 0.115)
--------------------------------------------------------------------------------
model
n missing distinct
234 0 38
lowest : 4runner 4wd a4 a4 quattro a6 quattro altima
highest: ram 1500 pickup 4wd range rover sonata tiburon toyota tacoma 4wd
--------------------------------------------------------------------------------
displ
n missing distinct Info Mean Gmd .05 .10
234 0 35 0.997 3.472 1.471 1.8 2.0
.25 .50 .75 .90 .95
2.4 3.3 4.6 5.4 5.7
lowest : 1.6 1.8 1.9 2.0 2.2, highest: 6.0 6.1 6.2 6.5 7.0
--------------------------------------------------------------------------------
year
n missing distinct Info Mean Gmd
234 0 2 0.75 2004 4.519
Value 1999 2008
Frequency 117 117
Proportion 0.5 0.5
--------------------------------------------------------------------------------
cyl
n missing distinct Info Mean Gmd
234 0 4 0.893 5.889 1.761
Value 4 5 6 8
Frequency 81 4 79 70
Proportion 0.346 0.017 0.338 0.299
--------------------------------------------------------------------------------
trans
n missing distinct
234 0 10
lowest : auto(av) auto(l3) auto(l4) auto(l5) auto(l6)
highest: auto(s4) auto(s5) auto(s6) manual(m5) manual(m6)
Value auto(av) auto(l3) auto(l4) auto(l5) auto(l6) auto(s4)
Frequency 5 2 83 39 6 3
Proportion 0.021 0.009 0.355 0.167 0.026 0.013
Value auto(s5) auto(s6) manual(m5) manual(m6)
Frequency 3 16 58 19
Proportion 0.013 0.068 0.248 0.081
--------------------------------------------------------------------------------
drv
n missing distinct
234 0 3
Value 4 f r
Frequency 103 106 25
Proportion 0.440 0.453 0.107
--------------------------------------------------------------------------------
cty
n missing distinct Info Mean Gmd .05 .10
234 0 21 0.993 16.86 4.686 11 11
.25 .50 .75 .90 .95
14 17 19 21 24
lowest : 9 11 12 13 14, highest: 26 28 29 33 35
--------------------------------------------------------------------------------
hwy
n missing distinct Info Mean Gmd .05 .10
234 0 27 0.993 23.44 6.668 15.0 16.3
.25 .50 .75 .90 .95
18.0 24.0 27.0 30.0 32.0
lowest : 12 14 15 16 17, highest: 35 36 37 41 44
--------------------------------------------------------------------------------
fl
n missing distinct
234 0 5
lowest : c d e p r, highest: c d e p r
Value c d e p r
Frequency 1 5 8 52 168
Proportion 0.004 0.021 0.034 0.222 0.718
--------------------------------------------------------------------------------
class
n missing distinct
234 0 7
lowest : 2seater compact midsize minivan pickup
highest: midsize minivan pickup subcompact suv
Value 2seater compact midsize minivan pickup subcompact
Frequency 5 47 41 11 33 35
Proportion 0.021 0.201 0.175 0.047 0.141 0.150
Value suv
Frequency 62
Proportion 0.265
--------------------------------------------------------------------------------
transmission
n missing distinct
234 0 2
Value auto manu
Frequency 157 77
Proportion 0.671 0.329
--------------------------------------------------------------------------------
2.2.4 pastecs::stat.desc()
优点:报告了置信区间、离散系数
library(pastecs)stat.desc(mpg) manufacturer model displ year cyl trans drv
nbr.val NA NA 234.000000 2.340000e+02 234.0000000 NA NA
nbr.null NA NA 0.000000 0.000000e+00 0.0000000 NA NA
nbr.na NA NA 0.000000 0.000000e+00 0.0000000 NA NA
min NA NA 1.600000 1.999000e+03 4.0000000 NA NA
max NA NA 7.000000 2.008000e+03 8.0000000 NA NA
range NA NA 5.400000 9.000000e+00 4.0000000 NA NA
sum NA NA 812.400000 4.688190e+05 1378.0000000 NA NA
median NA NA 3.300000 2.003500e+03 6.0000000 NA NA
mean NA NA 3.471795 2.003500e+03 5.8888889 NA NA
SE.mean NA NA 0.084458 2.948048e-01 0.1053493 NA NA
CI.mean NA NA 0.166399 5.808237e-01 0.2075589 NA NA
var NA NA 1.669158 2.033691e+01 2.5970434 NA NA
std.dev NA NA 1.291959 4.509646e+00 1.6115345 NA NA
coef.var NA NA 0.372130 2.250884e-03 0.2736568 NA NA
cty hwy fl class transmission
nbr.val 234.0000000 234.0000000 NA NA NA
nbr.null 0.0000000 0.0000000 NA NA NA
nbr.na 0.0000000 0.0000000 NA NA NA
min 9.0000000 12.0000000 NA NA NA
max 35.0000000 44.0000000 NA NA NA
range 26.0000000 32.0000000 NA NA NA
sum 3945.0000000 5485.0000000 NA NA NA
median 17.0000000 24.0000000 NA NA NA
mean 16.8589744 23.4401709 NA NA NA
SE.mean 0.2782199 0.3892672 NA NA NA
CI.mean 0.5481481 0.7669333 NA NA NA
var 18.1130736 35.4577785 NA NA NA
std.dev 4.2559457 5.9546434 NA NA NA
coef.var 0.2524439 0.2540358 NA NA NA
the number of values (nbr.val); the number of null values (nbr.null); the number of missing values (nbr.na); the minimal value (min); the maximal value (max); the range (range, that is, max-min); the sum of all non-missing values (sum); the median (median); the mean (mean); the standard error on the mean (SE.mean); the confidence interval of the mean (CI.mean) at the p level; the variance (var); the standard deviation (std.dev); and the variation coefficient (coef.var), defined as the standard deviation divided by the mean.
2.2.5 频数分布表 tbl_summary()
#加载包gtsummary
library(gtsummary)
tbl_summary(mpg)

| Characteristic | N = 234¹ |
|---|---|
| manufacturer | |
| audi | 18 (7.7%) |
| chevrolet | 19 (8.1%) |
| dodge | 37 (16%) |
| ford | 25 (11%) |
| honda | 9 (3.8%) |
| hyundai | 14 (6.0%) |
| jeep | 8 (3.4%) |
| land rover | 4 (1.7%) |
| lincoln | 3 (1.3%) |
| mercury | 4 (1.7%) |
| nissan | 13 (5.6%) |
| pontiac | 5 (2.1%) |
| subaru | 14 (6.0%) |
| toyota | 34 (15%) |
| volkswagen | 27 (12%) |
| model | |
| 4runner 4wd | 6 (2.6%) |
| a4 | 7 (3.0%) |
| a4 quattro | 8 (3.4%) |
| a6 quattro | 3 (1.3%) |
| altima | 6 (2.6%) |
| c1500 suburban 2wd | 5 (2.1%) |
| camry | 7 (3.0%) |
| camry solara | 7 (3.0%) |
| caravan 2wd | 11 (4.7%) |
| civic | 9 (3.8%) |
| corolla | 5 (2.1%) |
| corvette | 5 (2.1%) |
| dakota pickup 4wd | 9 (3.8%) |
| durango 4wd | 7 (3.0%) |
| expedition 2wd | 3 (1.3%) |
| explorer 4wd | 6 (2.6%) |
| f150 pickup 4wd | 7 (3.0%) |
| forester awd | 6 (2.6%) |
| grand cherokee 4wd | 8 (3.4%) |
| grand prix | 5 (2.1%) |
| gti | 5 (2.1%) |
| impreza awd | 8 (3.4%) |
| jetta | 9 (3.8%) |
| k1500 tahoe 4wd | 4 (1.7%) |
| land cruiser wagon 4wd | 2 (0.9%) |
| malibu | 5 (2.1%) |
| maxima | 3 (1.3%) |
| mountaineer 4wd | 4 (1.7%) |
| mustang | 9 (3.8%) |
| navigator 2wd | 3 (1.3%) |
| new beetle | 6 (2.6%) |
| passat | 7 (3.0%) |
| pathfinder 4wd | 4 (1.7%) |
| ram 1500 pickup 4wd | 10 (4.3%) |
| range rover | 4 (1.7%) |
| sonata | 7 (3.0%) |
| tiburon | 7 (3.0%) |
| toyota tacoma 4wd | 7 (3.0%) |
| displ | 3.30 (2.40, 4.60) |
| year | |
| 1999 | 117 (50%) |
| 2008 | 117 (50%) |
| cyl | |
| 4 | 81 (35%) |
| 5 | 4 (1.7%) |
| 6 | 79 (34%) |
| 8 | 70 (30%) |
| trans | |
| auto(av) | 5 (2.1%) |
| auto(l3) | 2 (0.9%) |
| auto(l4) | 83 (35%) |
| auto(l5) | 39 (17%) |
| auto(l6) | 6 (2.6%) |
| auto(s4) | 3 (1.3%) |
| auto(s5) | 3 (1.3%) |
| auto(s6) | 16 (6.8%) |
| manual(m5) | 58 (25%) |
| manual(m6) | 19 (8.1%) |
| drv | |
| 4 | 103 (44%) |
| f | 106 (45%) |
| r | 25 (11%) |
| cty | 17 (14, 19) |
| hwy | 24 (18, 27) |
| fl | |
| c | 1 (0.4%) |
| d | 5 (2.1%) |
| e | 8 (3.4%) |
| p | 52 (22%) |
| r | 168 (72%) |
| class | |
| 2seater | 5 (2.1%) |
| compact | 47 (20%) |
| midsize | 41 (18%) |
| minivan | 11 (4.7%) |
| pickup | 33 (14%) |
| subcompact | 35 (15%) |
| suv | 62 (26%) |
| transmission | |
| auto | 157 (67%) |
| manu | 77 (33%) |
| ¹ n (%); Median (IQR) | |
2.2.6 学术论文表格 kableExtra::kbl()
library(kableExtra)

# Keep only the three quantitative variables that go into the summary table.
data <- mpg %>%
  select(displ, hwy, cty)
#data %>%
# kable(col.names = c("displacement","highway mpg","city mpg")) %>%
# scroll_box(width = "500px", height = "200px")

# Summary table with one row per variable and one column per statistic.
# vapply() with numeric(1) guarantees that every statistic comes back as
# exactly one number per column, whereas sapply() silently changes its return
# type when the input is empty or ragged.
table1 <- data.frame(
  Mean = colMeans(data),
  SD = vapply(data, sd, numeric(1)),
  Median = vapply(data, median, numeric(1)),
  Min = vapply(data, min, numeric(1)),
  Max = vapply(data, max, numeric(1))
)
table1 Mean SD Median Min Max
displ 3.471795 1.291959 3.3 1.6 7
hwy 23.440171 5.954643 24.0 12.0 44
cty 16.858974 4.255946 17.0 9.0 35
# kbl() sets the table format:
#   col.names renames the columns,
#   digits = 3 sets the number of decimal places,
#   align = "c" centres the columns.
# kable_styling() then applies the default kableExtra styling.
kable_styling(
  kbl(
    table1,
    col.names = c("Mean", "SD", "Median", "Minimum", "Maximum"),
    digits = 3,
    align = "c"
  )
)
| Mean | SD | Median | Minimum | Maximum | |
|---|---|---|---|---|---|
| displ | 3.472 | 1.292 | 3.3 | 1.6 | 7 |
| hwy | 23.440 | 5.955 | 24.0 | 12.0 | 44 |
| cty | 16.859 | 4.256 | 17.0 | 9.0 | 35 |
table1 Mean SD Median Min Max
displ 3.471795 1.291959 3.3 1.6 7
hwy 23.440171 5.954643 24.0 12.0 44
cty 16.858974 4.255946 17.0 9.0 35
# The `.` placeholder.
# Special use of the pipe: wrapping the right-hand side in { } stops %>% from
# passing data as the first argument, so `.` can be placed wherever the data
# is needed.
# vapply() with numeric(1) is used instead of sapply() so that each statistic
# is guaranteed to be a single number per column.
table2 <- data %>%
  {
    data.frame(
      Mean = colMeans(.),
      SD = vapply(., sd, numeric(1)),
      Median = vapply(., median, numeric(1)),
      Min = vapply(., min, numeric(1)),
      Max = vapply(., max, numeric(1))
    )
  }
table2 Mean SD Median Min Max
displ 3.471795 1.291959 3.3 1.6 7
hwy 23.440171 5.954643 24.0 12.0 44
cty 16.858974 4.255946 17.0 9.0 35
3 相关分析
3.1 相关系数 cor()
cor(mpg$displ, mpg$cty)[1] -0.798524
mpg %>%
select(displ, cty, hwy) %>%
cor() %>%
round(digits = 3) displ cty hwy
displ 1.000 -0.799 -0.766
cty -0.799 1.000 0.956
hwy -0.766 0.956 1.000
3.2 相关系数显著性检验
3.2.1 cor.test()
cor.test(mpg$displ, mpg$hwy)
Pearson's product-moment correlation
data: mpg$displ and mpg$hwy
t = -18.151, df = 232, p-value < 2.2e-16
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
-0.8142727 -0.7072539
sample estimates:
cor
-0.76602
cor.test(mpg$displ, mpg$cty)
Pearson's product-moment correlation
data: mpg$displ and mpg$cty
t = -20.205, df = 232, p-value < 2.2e-16
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
-0.8406782 -0.7467508
sample estimates:
cor
-0.798524
3.2.2 Hmisc::rcorr()
library(Hmisc)
# Convert the data frame to a matrix and pass it to rcorr(); the result is
# stored for the extraction of its components below.
correl.matrix <- rcorr(as.matrix(data))
correl.matrix displ hwy cty
displ 1.00 -0.77 -0.80
hwy -0.77 1.00 0.96
cty -0.80 0.96 1.00
n= 234
P
displ hwy cty
displ 0 0
hwy 0 0
cty 0 0
attributes(correl.matrix)$names
[1] "r" "n" "P"
$class
[1] "rcorr"
round(correl.matrix$r, digits = 3) displ hwy cty
displ 1.000 -0.766 -0.799
hwy -0.766 1.000 0.956
cty -0.799 0.956 1.000
round(correl.matrix$P, digits = 3) displ hwy cty
displ NA 0 0
hwy 0 NA 0
cty 0 0 NA
3.3 相关系数可视化
3.3.1 corrplot::corrplot()
library(corrplot)

# Correlation plot with the default settings.
corrplot(cor(data))

# Squares instead of the default glyph, upper triangle only.
corrplot(cor(data), method = "square", type = "upper")

# Coefficients printed as numbers, lower triangle only.
corrplot(cor(data), method = "number", type = "lower")
3.3.2 ggstatsplot::ggscatterstats
library(ggstatsplot)
library(tidyverse)

# Scatter plot of hwy against displ with the test results annotated;
# bf.message = FALSE leaves out the Bayes-factor message.
ggscatterstats(
  data,
  x = displ,
  y = hwy,
  bf.message = FALSE
)
4 本章习题
数据文件:mtcars
1.查看mtcars的帮助文件,简要解释每个变量的含义。
2.用图形展示mpg的分布。(提示:直方图、箱线图)
3.将汽车按vs分组,用图形比较两个组别的汽车mpg的分布。(提示:分组直方图、分组箱线图)
4.将汽车按am分组,用图形比较两个组别的汽车mpg的分布。(提示:分组直方图、分组箱线图)
5.将汽车按vs分组,分别绘制两个组别wt的箱线图。
6.用图形展示vs的分布。(提示:条形图)
7.用图形展示am的分布。(提示:条形图)
8.用图形展示vs、am的分布。(提示:分组条形图、分组堆栈条形图)
9.用图形展示mpg和wt的关系。(提示:散点图)
10.用图形展示mpg和disp的关系。(提示:散点图)
11.用图形展示mpg,disp,hp两两之间的关系。(提示:矩阵散点图)
12.将汽车按am分成两组,绘制mpg和disp的分组散点图。
13.将汽车按vs分成两组,绘制mpg和wt的分组散点图。
14.报告定量变量的均值、中位数、标准差、最大值和最小值。
15.报告mpg, disp, hp, wt, drat五个变量两两之间的相关系数,并用可视化工具呈现其相关系数。(提示:corrplot::corrplot())
答题要求:
标好题号,将代码和输出图形/结果复制到Word文档,再将Word文档转换成图片,在91速课平台提交。