install.packages("ggplot2")
install.packages("tidyverse")
install.packages("MASS")
install.packages("klaR")
install.packages("devtools")
install.packages("psych")
install.packages("MVN")
install.packages("biotools")十项全能运动员判别分析
安装包
加载包
library(ggplot2)
library(tidyverse)
library(psych)
library(biotools)
library(MVN)数据文件
导入数据
library(readxl)
decathlon <- read_excel("decathlon.xlsx",
col_types = c("numeric", "text", "numeric",
"numeric", rep("text", 3), rep("numeric", 3), "text",
rep("numeric", 11)))创建分类变量
library(tidyverse)
library(DescTools)
decathlon <- decathlon %>% mutate(tier = case_when(
rank %[]% c(1,33) ~ "first",
rank %[]% c(34,66) ~ "second",
rank %[]% c(67,100) ~ "third",
))多元正态检验
decathlon %>%
filter(tier == "first") %>%
dplyr::select(M100:javelin_throw) %>%
MVN::mardia() Test Statistic p.value Method
1 Mardia Skewness 182.182632 0.97033157 asymptotic
2 Mardia Kurtosis -1.844624 0.06509221 asymptotic
decathlon %>%
filter(tier == "second") %>%
dplyr::select(M100:javelin_throw) %>%
MVN::mardia() Test Statistic p.value Method
1 Mardia Skewness 260.62023080 0.03144084 asymptotic
2 Mardia Kurtosis 0.03828022 0.96946426 asymptotic
decathlon %>%
filter(tier == "third") %>%
dplyr::select(M100:javelin_throw) %>%
MVN::mardia() Test Statistic p.value Method
1 Mardia Skewness 226.3857853 0.3694740 asymptotic
2 Mardia Kurtosis -0.2327665 0.8159428 asymptotic
检验协方差矩阵是否相等
df <- decathlon %>%
dplyr::select(M100:tier) %>%
mutate(tier = as.factor(tier)) %>%
as.data.frame()
boxM(df[, -11], df[, 11])
Box's M-test for Homogeneity of Covariance Matrices
data: df[, -11]
Chi-Sq (approx.) = 169.13, df = 110, p-value = 0.0002509
建立判别函数
library(MASS)
model <- lda(tier ~ M100 + Hurdles_M100 + M400 + M1500 +
long_jump + high_jump + pole_vault +
shot_put + discus_thow + javelin_throw, decathlon)
modelCall:
lda(tier ~ M100 + Hurdles_M100 + M400 + M1500 + long_jump + high_jump +
pole_vault + shot_put + discus_thow + javelin_throw, data = decathlon)
Prior probabilities of groups:
first second third
0.33 0.33 0.34
Group means:
M100 Hurdles_M100 M400 M1500 long_jump high_jump pole_vault
first 10.82909 14.51606 48.66818 277.9918 7.452424 2.006970 4.924242
second 11.05091 14.76273 49.88515 280.3267 7.202727 1.968182 4.668485
third 11.04735 14.72765 50.16412 278.9579 7.139706 1.946471 4.660882
shot_put discus_thow javelin_throw
first 14.48636 44.79182 59.49242
second 13.97667 43.39879 58.46515
third 13.78294 41.44118 54.27412
Coefficients of linear discriminants:
LD1 LD2
M100 1.18999529 -0.42663574
Hurdles_M100 0.35551226 -0.88005615
M400 0.81924845 0.01025378
M1500 0.01210876 -0.01730195
long_jump -1.76643422 -0.67067804
high_jump -6.92011101 -1.04246407
pole_vault -2.98252729 2.02264320
shot_put -0.36594939 0.22228646
discus_thow -0.18572931 -0.06791697
javelin_throw -0.07878029 -0.12571473
Proportion of trace:
LD1 LD2
0.9758 0.0242
预测样本中个案的类别
model.predict <- predict(model, decathlon)
decathlon$predict <- model.predict$class评估预测效果
decathlon$ld1 <- model.predict$x[,1]
decathlon$ld2 <- model.predict$x[,2]
# 对比个案的观测类别的预测类别
table(decathlon$tier, decathlon$predict)
first second third
first 32 1 0
second 0 30 3
third 0 6 28
# 计算预测正确率
mean(decathlon$tier == decathlon$predict)[1] 0.9
可视化
判别函数得分直方图
ldahist(model.predict$x[,1], model.predict$class)
ldahist(model.predict$x[,2], model.predict$class)
标记错误的个案
decathlon$right <- decathlon$tier == decathlon$predict
wrong_cases <- decathlon %>%
filter(decathlon$right == FALSE)
wrong_cases# A tibble: 10 × 27
rank COMPETITOR `year of birth` Age Nationality country Continent
<dbl> <chr> <dbl> <dbl> <chr> <chr> <chr>
1 32 Martin ROE 1992 27 NOR Norway Europe
2 54 Akihiko NAKAMURA 1990 29 JPN Japan Asia
3 60 Ludovic BESSON 1998 21 FRA France Europe
4 65 Makenson GLETTY 1999 20 FRA France Europe
5 70 John LANE 1989 30 GBR United … Europe
6 71 Rik TAAM 1997 22 NED netherl… Europe
7 72 Nick GUERRANT 1999 20 USA United … North am…
8 82 Kazuya KAWASAKI 1992 27 JPN Japan Asia
9 85 Elmo SAVOLA 1995 24 FIN Finland Europe
10 92 Aleksandar GRNOVIC 1996 23 SRB Serbia Europe
# ℹ 20 more variables: Continent_code <dbl>, continent_G3 <dbl>,
# continent_G4 <dbl>, VENUE <chr>, MARK <dbl>, M100 <dbl>,
# Hurdles_M100 <dbl>, M400 <dbl>, M1500 <dbl>, long_jump <dbl>,
# high_jump <dbl>, pole_vault <dbl>, shot_put <dbl>, discus_thow <dbl>,
# javelin_throw <dbl>, tier <chr>, predict <fct>, ld1 <dbl>, ld2 <dbl>,
# right <lgl>
decathlon %>% ggplot(aes(ld1, ld2, col = tier))+
geom_point()+
geom_point(data = wrong_cases,
aes(ld1, ld2),
col = "red", alpha = 0.6)+
ylim(-5,5)+
xlim(-5,5)
判别函数得分散点图,按类别着色
library(ggord)
ggord(model, decathlon$tier)
预测新个案
new_case <- data.frame(
M100 = 10.73,
Hurdles_M100 = 15.34,
M400 = 47.93,
M1500 = 288.58,
long_jump = 7.58,
high_jump = 2.06,
pole_vault = 5.10,
shot_put = 14.28,
discus_thow = 36.93,
javelin_throw = 61.43
)
predict(model, new_case)$class
[1] first
Levels: first second third
$posterior
first second third
1 0.9967666 0.003214922 1.845589e-05
$x
LD1 LD2
1 -2.719559 -0.3001785
new_pred <- predict(model, new_case)
new_scores <- new_pred$x # 提取LD1和LD2得分
new_class <- new_pred$class # 提取预测类别在判别函数得分散点图中标记新个案
# 绘制基本ggord图
ggord(model, decathlon$tier) +
geom_point(data = data.frame(LD1 = new_scores[,1],
LD2 = new_scores[,2],
tier = new_class),
aes(x = LD1, y = LD2),
size = 5, shape = 17, color = "cyan") # 用大三角形标记新点

点击下载数据文件: decathlon.xlsx