本章介绍R中的对应分析。
# 安装包
install.packages("tidyverse")
install.packages("FactoMineR")
install.packages("factoextra")
install.packages("MASS")
install.packages("gplots")
install.packages("corrplot")
install.packages("RColorBrewer")
# 加载包
library(tidyverse)
library(FactoMineR)
library(factoextra)
library(MASS)
library(gplots)
#导入数据,修改行名,列名
library(MASS)
data(caith)
rownames(caith) <- c("eye.blue", "eye.light", "eye.medium", "eye.dark")
colnames(caith) <- c("hair.fair", "hair.red", "hair.medium",
"hair.dark", "hair.black")
1 第1步 独立性检验、列联表可视化
chisq.test(caith)
Pearson's Chi-squared test
data: caith
X-squared = 1240, df = 12, p-value < 2.2e-16
mosaicplot(caith,
color = TRUE,
cex.axis = 0.8,
border = 5,
main = "Eye Color and Hair Color")
library(tidyverse)
library(gplots)
%>%
caith as.matrix() %>%
as.table() %>%
t() %>%
balloonplot(main ="Eye Color and Hair Color",
label = FALSE,
show.margins = FALSE)
2 第2步 计算Compute CA
library(FactoMineR)
<- CA(caith, graph = FALSE)
res.ca
%>% summary() res.ca
Call:
CA(X = caith, graph = FALSE)
The chi square of independence between the two variables is equal to 1240.039 (p-value = 4.123993e-258 ).
Eigenvalues
Dim.1 Dim.2 Dim.3
Variance 0.199 0.030 0.001
% of var. 86.556 13.070 0.373
Cumulative % of var. 86.556 99.627 100.000
Rows
Iner*1000 Dim.1 ctr cos2 Dim.2 ctr cos2
eye.blue | 25.553 | -0.400 10.719 0.836 | 0.165 12.121 0.143 |
eye.light | 59.557 | -0.441 28.591 0.956 | 0.088 7.629 0.039 |
eye.medium | 20.149 | 0.034 0.187 0.018 | -0.245 65.701 0.981 |
eye.dark | 124.932 | 0.703 60.503 0.965 | 0.134 14.550 0.035 |
Dim.3 ctr cos2
eye.blue -0.064 63.832 0.021 |
eye.light 0.032 34.451 0.005 |
eye.medium -0.006 1.181 0.001 |
eye.dark 0.004 0.536 0.000 |
Columns
Iner*1000 Dim.1 ctr cos2 Dim.2 ctr cos2 Dim.3
hair.fair | 88.134 | -0.544 40.116 0.907 | 0.174 27.131 0.093 | -0.013
hair.red | 3.752 | -0.233 1.450 0.770 | 0.048 0.411 0.033 | 0.118
hair.medium | 17.918 | -0.042 0.352 0.039 | -0.208 57.211 0.961 | -0.003
hair.dark | 92.308 | 0.589 44.915 0.969 | 0.104 9.274 0.030 | -0.010
hair.black | 28.079 | 1.094 13.167 0.934 | 0.286 5.973 0.064 | 0.046
ctr cos2
hair.fair 4.928 0.000 |
hair.red 86.090 0.197 |
hair.medium 0.483 0.000 |
hair.dark 3.075 0.000 |
hair.black 5.425 0.002 |
res.ca
**Results of the Correspondence Analysis (CA)**
The row variable has 4 categories; the column variable has 5 categories
The chi square of independence between the two variables is equal to 1240.039 (p-value = 4.123993e-258 ).
*The results are available in the following objects:
name description
1 "$eig" "eigenvalues"
2 "$col" "results for the columns"
3 "$col$coord" "coord. for the columns"
4 "$col$cos2" "cos2 for the columns"
5 "$col$contrib" "contributions of the columns"
6 "$row" "results for the rows"
7 "$row$coord" "coord. for the rows"
8 "$row$cos2" "cos2 for the rows"
9 "$row$contrib" "contributions of the rows"
10 "$call" "summary called parameters"
11 "$call$marge.col" "weights of the columns"
12 "$call$marge.row" "weights of the rows"
#查看特征值、维度贡献率、累积贡献率
$eig res.ca
eigenvalue percentage of variance cumulative percentage of variance
dim 1 0.1992447520 86.5562709 86.55627
dim 2 0.0300867741 13.0703516 99.62662
dim 3 0.0008594814 0.3733775 100.00000
#查看过渡矩阵Z的奇异值分解
$svd res.ca
$vs
[1] 0.44636840 0.17345540 0.02931691
$U
[,1] [,2] [,3]
[1,] -0.89679252 0.9536227 -2.1884132
[2,] -0.98731818 0.5100045 1.0837859
[3,] 0.07530627 -1.4124778 -0.1894089
[4,] 1.57434710 0.7720361 0.1482208
$V
[,1] [,2] [,3]
[1,] -1.21871379 1.0022432 -0.4271282
[2,] -0.52257500 0.2783364 4.0268545
[3,] -0.09414671 -1.2009094 -0.1103959
[4,] 1.31888486 0.5992920 -0.3450676
[5,] 2.45176017 1.6513565 1.5736976
#查看行变量的坐标、贡献、cos2(the quality of each dimension for each point)
$row res.ca
$coord
Dim 1 Dim 2 Dim 3
eye.blue -0.40029985 0.16541100 -0.064157519
eye.light -0.44070764 0.08846303 0.031773257
eye.medium 0.03361434 -0.24500190 -0.005552885
eye.dark 0.70273880 0.13391383 0.004345377
$contrib
Dim 1 Dim 2 Dim 3
eye.blue 10.7191765 12.120781 63.831659
eye.light 28.5906733 7.628833 34.450625
eye.medium 0.1867536 65.700687 1.181429
eye.dark 60.5033967 14.549698 0.536287
$cos2
Dim 1 Dim 2 Dim 3
eye.blue 0.83581532 0.14271455 2.147013e-02
eye.light 0.95648915 0.03853918 4.971671e-03
eye.medium 0.01846682 0.98102924 5.039418e-04
eye.dark 0.96492376 0.03503934 3.689437e-05
$inertia
[1] 0.02555277 0.05955678 0.02014947 0.12493199
## calculate 第1个行变量的contribution
#计算对应矩阵correspondence matrix ——P
<- as.matrix(caith/sum(caith))
P %>% round(3) P
hair.fair hair.red hair.medium hair.dark hair.black
eye.blue 0.061 0.007 0.045 0.020 0.001
eye.light 0.128 0.022 0.108 0.035 0.001
eye.medium 0.064 0.016 0.169 0.076 0.005
eye.dark 0.018 0.009 0.075 0.126 0.016
#计算边缘概率、独立性假定下的期望概率
<- rowSums(P)
pi. pi.
eye.blue eye.light eye.medium eye.dark
0.1332838 0.2932987 0.3293113 0.2441062
<- colSums(P)
pj. pj.
hair.fair hair.red hair.medium hair.dark hair.black
0.27009467 0.05309077 0.39669575 0.25821422 0.02190459
$row$coord[,1]^2 * pi.)/sum(res.ca$row$coord[,1]^2 * pi.) (res.ca
eye.blue eye.light eye.medium eye.dark
0.107191765 0.285906733 0.001867536 0.605033967
## calculate 第1个行变量的cos2
$row$coord[1,]^2 )/sum(res.ca$row$coord[1,]^2) (res.ca
Dim 1 Dim 2 Dim 3
0.83581532 0.14271455 0.02147013
#查看列变量的坐标、贡献、cos2(the quality of each dimension for each point)
$col res.ca
$coord
Dim 1 Dim 2 Dim 3
hair.fair -0.54399533 0.17384449 -0.012522082
hair.red -0.23326097 0.04827895 0.118054940
hair.medium -0.04202412 -0.20830421 -0.003236468
hair.dark 0.58870853 0.10395044 -0.010116315
hair.black 1.09438828 0.28643670 0.046135954
$contrib
Dim 1 Dim 2 Dim 3
hair.fair 40.1161707 27.1307783 4.9275678
hair.red 1.4498275 0.4113003 86.0896470
hair.medium 0.3516154 57.2108003 0.4834635
hair.dark 44.9152601 9.2737881 3.0745985
hair.black 13.1671264 5.9733330 5.4247232
$cos2
Dim 1 Dim 2 Dim 3
hair.fair 0.9069022 0.09261727 0.0004805329
hair.red 0.7698335 0.03297830 0.1971882209
hair.medium 0.0390998 0.96066829 0.0002319103
hair.dark 0.9694868 0.03022692 0.0002862767
hair.black 0.9343341 0.06400541 0.0016604979
$inertia
[1] 0.088134492 0.003752377 0.017917615 0.092307908 0.028078616
## calculate 第1个列变量的cos2
$col$coord[1,]^2/sum(res.ca$col$coord[1,]^2) res.ca
Dim 1 Dim 2 Dim 3
0.9069021982 0.0926172689 0.0004805329
3 第3步 可视化Visualization
3.1 碎石图scree plot
#绘制scree plot
library(factoextra)
fviz_screeplot(res.ca, addlabels = TRUE, ylim = c(0, 100))
# symmetric plot
library(factoextra)
fviz_ca_biplot(res.ca, repel = TRUE)
dimdesc(res.ca)
$`Dim 1`
$`Dim 1`$row
coord
eye.light -0.44070764
eye.blue -0.40029985
eye.medium 0.03361434
eye.dark 0.70273880
$`Dim 1`$col
coord
hair.fair -0.54399533
hair.red -0.23326097
hair.medium -0.04202412
hair.dark 0.58870853
hair.black 1.09438828
$`Dim 2`
$`Dim 2`$row
coord
eye.medium -0.24500190
eye.light 0.08846303
eye.dark 0.13391383
eye.blue 0.16541100
$`Dim 2`$col
coord
hair.medium -0.20830421
hair.red 0.04827895
hair.dark 0.10395044
hair.fair 0.17384449
hair.black 0.28643670
#Rows are represented by blue points and columns by red triangles.
3.2 行变量绘图 Graph of row variables
<- get_ca_row(res.ca)
row row
Correspondence Analysis - Results for rows
===================================================
Name Description
1 "$coord" "Coordinates for the rows"
2 "$cos2" "Cos2 for the rows"
3 "$contrib" "contributions of the rows"
4 "$inertia" "Inertia of the rows"
row$coord: coordinates of each row point in each dimension (1, 2 and 3). Used to create the scatter plot.
row$cos2: quality of representation of rows.
var$contrib: contribution of rows (in %) to the definition of the dimensions.
#Coordinates of row points
$coord[,1:2] row
Dim 1 Dim 2
eye.blue -0.40029985 0.16541100
eye.light -0.44070764 0.08846303
eye.medium 0.03361434 -0.24500190
eye.dark 0.70273880 0.13391383
fviz_ca_row(res.ca, repel = TRUE)
fviz_ca_row(res.ca, col.row="steelblue", shape.row = 15, repel = TRUE)
#Quality of representation of rows
#The quality of representation of the rows on the factor map is called the squared cosine (cos2) or the squared correlations.
$cos2 row
Dim 1 Dim 2 Dim 3
eye.blue 0.83581532 0.14271455 2.147013e-02
eye.light 0.95648915 0.03853918 4.971671e-03
eye.medium 0.01846682 0.98102924 5.039418e-04
eye.dark 0.96492376 0.03503934 3.689437e-05
# Color by cos2 values: quality on the factor map
fviz_ca_row(res.ca, col.row = "cos2",
gradient.cols = c("#00AFBB", "#E7B800", "#FC4E07"),
repel = TRUE)
# Change the transparency by cos2 values
fviz_ca_row(res.ca, alpha.row="cos2")
#visualize the cos2 of row points on all the dimensions
library(corrplot)
corrplot(row$cos2, is.corr=FALSE)
# Cos2 of rows on Dim.1
fviz_cos2(res.ca, choice = "row", axes = 1)
# Cos2 of rows on Dim.2
fviz_cos2(res.ca, choice = "row", axes = 2)
# Cos2 of rows on Dim.1 and Dim.2
fviz_cos2(res.ca, choice = "row", axes = 1:2)
#Coordinates of row points
$contrib row
Dim 1 Dim 2 Dim 3
eye.blue 10.7191765 12.120781 63.831659
eye.light 28.5906733 7.628833 34.450625
eye.medium 0.1867536 65.700687 1.181429
eye.dark 60.5033967 14.549698 0.536287
# Contributions of rows to dimension 1
fviz_contrib(res.ca, choice = "row", axes = 1)
# Contributions of rows to dimension 2
fviz_contrib(res.ca, choice = "row", axes = 2, top = 2)
#The most important (or, contributing) row points can be highlighted on the scatter plot as follow:
fviz_ca_row(res.ca, col.row = "contrib",
gradient.cols = c("#00AFBB", "#E7B800", "#FC4E07"),
repel = TRUE)
3.3 列变量绘图 Graph of column variables
fviz_ca_col(res.ca, col.col = "cos2",
gradient.cols = c("#00AFBB", "#E7B800", "#FC4E07"),
repel = TRUE)
3.4 筛选画图变量Filter Results
# Visualize rows with cos2 >= 0.9
fviz_ca_row(res.ca, select.row = list(cos2 = 0.9), repel = T)
# Top 3 active rows with the highest cos2
fviz_ca_row(res.ca, select.row = list(cos2 = 3), repel = T)
# Select by names
<- list(name = c("eye.dark", "eye.medium"))
name fviz_ca_row(res.ca, select.row = name, repel = T)
# Top 3 contributing rows and columns
fviz_ca_biplot(res.ca, select.row = list(contrib = 3),
select.col = list(contrib = 3), repel = T) +
theme_minimal()
4 应用举例: Nobel Prize
library(readxl)
#读取数据,将其保存为data frame
<- read_excel("nobel_prize.xlsx") %>%
nobel_prize data.frame()
#为nobel_prize添加行名,才能在图中显示出行名标签
rownames(nobel_prize) <- nobel_prize$Country
nobel_prize
Country Chemistry Economics Literature Medicine Peace Physics
Canada Canada 4 3 2 4 1 4
France France 8 3 11 12 10 9
Germany Germany 24 1 8 18 5 24
Italy Italy 1 1 6 5 1 5
Japan Japan 6 0 2 3 1 11
Russia Russia 4 3 5 2 3 10
UK UK 23 6 7 26 11 20
USA USA 51 43 8 70 19 66
#转换nobel_prize数据结构
library(tidyverse)
<- nobel_prize %>%
nobel_long pivot_longer(c("Chemistry","Economics","Literature",
"Literature", "Medicine", "Peace", "Physics"),
names_to = "field",
values_to = "number")
#查看色卡
library(RColorBrewer)
::display.brewer.all() RColorBrewer
#绘制堆栈条形图,先按国家分,再按领域分
%>%
nobel_long ggplot(aes(Country,number,fill = field))+
geom_col(position = "fill")+
theme_bw()+
theme(panel.grid.major = element_blank(),
panel.grid.minor = element_blank())+
labs(title = "Nobel Prize by country and Field",
x = "Country",
y = "Number")+
scale_fill_brewer(palette = "Accent")+
coord_flip()+
theme_minimal()
#绘制堆栈条形图,先按领域分,再国家分
%>%
nobel_long ggplot(aes(field,number,fill = Country))+
geom_col(position = "fill")+
theme_bw()+
theme(panel.grid.major = element_blank(),
panel.grid.minor = element_blank())+
labs(title = "Nobel Prize by Field and Country",
x = "Field",
y = "Number")+
scale_fill_brewer(palette = "Spectral")+
theme_minimal()
#计算CA
<- CA(nobel_prize[-1]) res.ca
#查看CA详细结果
summary(res.ca)
Call:
CA(X = nobel_prize[-1])
The chi square of independence between the two variables is equal to 86.75919 (p-value = 2.76733e-06 ).
Eigenvalues
Dim.1 Dim.2 Dim.3 Dim.4 Dim.5
Variance 0.083 0.037 0.022 0.008 0.002
% of var. 54.748 24.600 14.227 5.180 1.245
Cumulative % of var. 54.748 79.348 93.575 98.755 100.000
Rows
Iner*1000 Dim.1 ctr cos2 Dim.2 ctr cos2 Dim.3
Canada | 2.040 | -0.047 0.085 0.035 | 0.113 1.086 0.199 | 0.112
France | 33.080 | 0.498 27.709 0.698 | 0.273 18.536 0.210 | -0.155
Germany | 19.066 | 0.158 4.226 0.185 | -0.320 38.296 0.752 | -0.061
Italy | 25.908 | 0.713 20.316 0.653 | 0.265 6.261 0.090 | 0.282
Japan | 15.089 | 0.176 1.492 0.082 | -0.516 28.653 0.711 | 0.244
Russia | 14.432 | 0.356 7.210 0.416 | 0.049 0.298 0.008 | 0.355
UK | 7.354 | 0.036 0.249 0.028 | -0.041 0.727 0.037 | -0.205
USA | 35.239 | -0.267 38.713 0.915 | 0.071 6.143 0.065 | 0.037
ctr cos2
Canada 1.834 0.195 |
France 10.375 0.068 |
Germany 2.431 0.028 |
Italy 12.251 0.102 |
Japan 11.089 0.159 |
Russia 27.589 0.414 |
UK 31.534 0.929 |
USA 2.897 0.018 |
Columns
Iner*1000 Dim.1 ctr cos2 Dim.2 ctr cos2 Dim.3
Chemistry | 13.469 | -0.058 0.862 0.053 | -0.212 25.471 0.708 | -0.107
Economics | 38.902 | -0.462 26.916 0.577 | 0.351 34.675 0.334 | 0.174
Literature | 58.438 | 0.790 64.353 0.918 | 0.186 7.942 0.051 | 0.117
Medicine | 10.429 | -0.108 3.418 0.273 | 0.066 2.874 0.103 | -0.115
Peace | 16.213 | 0.203 4.443 0.228 | 0.208 10.316 0.238 | -0.226
Physics | 14.758 | 0.005 0.008 0.000 | -0.164 18.723 0.475 | 0.164
ctr cos2
Chemistry 11.181 0.180 |
Economics 14.649 0.082 |
Literature 5.478 0.020 |
Medicine 15.069 0.313 |
Peace 21.169 0.283 |
Physics 32.455 0.476 |
#绘制scree plot
library(factoextra)
fviz_screeplot(res.ca, addlabels = TRUE, ylim = c(0, 75))#Coordinates of row points
# 绘制coordinate plot
$row$coord[,1:2] res.ca
Dim 1 Dim 2
Canada -0.04723540 0.11346322
France 0.49832819 0.27320639
Germany 0.15840265 -0.31963720
Italy 0.71266566 0.26520715
Japan 0.17551152 -0.51563474
Russia 0.35614910 0.04856816
UK 0.03567203 -0.04083529
USA -0.26748757 0.07142297
fviz_ca_row(res.ca, col.row="steelblue", shape.row = 15, repel = TRUE)
$col$coord[,1:2] res.ca
Dim 1 Dim 2
Chemistry -0.058167033 -0.21195905
Economics -0.461605434 0.35119852
Literature 0.789819984 0.18598649
Medicine -0.107691667 0.06618674
Peace 0.203420919 0.20777666
Physics 0.004937747 -0.16376482
fviz_ca_col(res.ca, col.row="steelblue", shape.row = 15, repel = TRUE)
#绘制cos2 plot
$cos2 row
Dim 1 Dim 2 Dim 3
eye.blue 0.83581532 0.14271455 2.147013e-02
eye.light 0.95648915 0.03853918 4.971671e-03
eye.medium 0.01846682 0.98102924 5.039418e-04
eye.dark 0.96492376 0.03503934 3.689437e-05
# Color by cos2 values: quality on the factor map
fviz_ca_row(res.ca, col.row = "cos2",
gradient.cols = c("#00AFBB", "#E7B800", "#FC4E07"),
repel = TRUE)
library(corrplot)
corrplot(row$cos2, is.corr=FALSE,
col=colorRampPalette(c("blue","white","green"))(100))
#绘制contrib plot
$row$contrib res.ca
Dim 1 Dim 2 Dim 3 Dim 4 Dim 5
Canada 0.08455223 1.0857665 1.833533 1.94008269 53.437029
France 27.70921162 18.5357993 10.375419 10.10834330 0.270120
Germany 4.22602694 38.2964354 2.431166 5.76383493 11.839516
Italy 20.31620837 6.2614912 12.251198 48.99181647 6.243865
Japan 1.49161410 28.6527237 11.089113 4.49241164 19.121633
Russia 7.21015937 0.2984151 27.588955 28.48923613 4.835537
UK 0.24914716 0.7266209 31.533847 0.07983836 2.064111
USA 38.71308022 6.1427479 2.896768 0.13443649 2.188190
fviz_ca_row(res.ca, col.row = "contrib",
gradient.cols = c("#00AFBB", "#E7B800", "#FC4E07"),
repel = TRUE)
fviz_ca_col(res.ca, col.col = "cos2",
gradient.cols = c("#00AFBB", "#E7B800", "#FC4E07"),
repel = TRUE)
#筛选满足条件的组别
# Visualize rows with cos2 >= 0.8
fviz_ca_row(res.ca, select.row = list(cos2 = 0.8), repel = T)
# Visualize columns with cos2 >= 0.8
fviz_ca_col(res.ca, select.col = list(cos2 = 0.8), repel = T)
# Top 3 contributing rows and columns
fviz_ca_biplot(res.ca, select.row = list(contrib = 3),
select.col = list(contrib = 3), repel = T) +
theme_minimal()
5 本章作业
答题要求:将R的命令和输出结果转成图片,上传至91速课平台。
习题1: 教材P165,例题8.1
要求: 1.1 对行变量和列变量进行独立性检验,独立性检验的卡方检验统计量是多少?P值是什么,你的检验结论是什么?
1.2 绘制碎石图。你提取几个维度?每个维度的贡献分别是多少?
1.3 绘制对应分析图。
1.4 绘制行变量的contribution、cos2的图像。
1.5 绘制列变量的contribution、cos2的图像。列变量的哪几个组别是相对重要的组别?
1.6 从图中可以发现哪些组别关系紧密?
习题2
提示:数据请自行录入,可在EXCEL录入好后,再导入R。
要求: 2.1 对行变量和列变量进行独立性检验,独立性检验的卡方检验统计量是多少?P值是什么,你的检验结论是什么?
2.2 绘制碎石图。你提取几个维度?每个维度的贡献分别是多少?
2.3 绘制对应分析图。
2.4 绘制行变量的contribution、cos2的图像。
2.5 绘制列变量的contribution、cos2的图像。
2.6 从图中可以发现哪些组别关系紧密?
习题3
提示:数据请自行录入,可在EXCEL录入好后,再导入R。
要求: 3.1 对行变量和列变量进行独立性检验,独立性检验的卡方检验统计量是多少?P值是什么,你的检验结论是什么?
3.2 绘制碎石图。你提取几个维度?每个维度的贡献分别是多少?
3.3 绘制对应分析图。
3.4 绘制行变量的contribution、cos2的图像。
3.5 绘制列变量的contribution、cos2的图像。
3.6 从图中可以发现哪些组别关系紧密?