本章介绍R中的典型相关分析(Canonical Correlation Analysis)。
#安装和加载包
#install.packages("CCA")
#install.packages("yacca")
1 第1步 定义两组变量
library(readr)
#读取数据文件
library(readxl)
<- read_excel("eg9.1.xls")
eg9_1
#对eg9_1重命名,去掉第1列,保存为数据框data
library(tidyverse)
<- eg9_1 %>% rename(
data weight = x1, waist = x2, pulse = x3,
chinup = y1, situp = y2, jump = y3) %>% select(-1)
#定义两组变量
<- data[,1:3]
physical <- data[,4:6] train
2 第2步: 计算典型相关分析模型
2.1 方法一:CCA::cc
library(CCA)
<- cc(physical, train)
res.cc
#查看典型相关系数
$cor res.cc
[1] 0.79560815 0.20055604 0.07257029
#查看典型变量与原始变量的关系
$xcoef res.cc
[,1] [,2] [,3]
weight 0.031404688 0.07631951 -0.007735047
waist -0.493241676 -0.36872299 0.158033647
pulse 0.008199315 0.03205199 0.145732242
$ycoef res.cc
[,1] [,2] [,3]
chinup 0.06611399 0.071041211 -0.245275347
situp 0.01684623 -0.001973745 0.019767637
jump -0.01397157 -0.020714106 -0.008167472
2.2 方法二:yacca::cca
library(yacca)
<- cca(physical, train)
res.cca
#查看结果
res.cca
Canonical Correlation Analysis
Canonical Correlations:
CV 1 CV 2 CV 3
0.79560815 0.20055604 0.07257029
X Coefficients:
CV 1 CV 2 CV 3
weight -0.031404688 -0.07631951 -0.007735047
waist 0.493241676 0.36872299 0.158033647
pulse -0.008199315 -0.03205199 0.145732242
Y Coefficients:
CV 1 CV 2 CV 3
chinup -0.06611399 -0.071041211 -0.245275347
situp -0.01684623 0.001973745 0.019767637
jump 0.01397157 0.020714106 -0.008167472
Structural Correlations (Loadings) - X Vars:
CV 1 CV 2 CV 3
weight 0.6206424 -0.7723919 -0.13495886
waist 0.9254249 -0.3776614 -0.03099486
pulse -0.3328481 0.0414842 0.94206752
Structural Correlations (Loadings) - Y Vars:
CV 1 CV 2 CV 3
chinup -0.7276254 0.2369522 -0.64375064
situp -0.8177285 0.5730231 0.05444915
jump -0.1621905 0.9586280 -0.23393722
Aggregate Redundancy Coefficients (Total Variance Explained):
X | Y: 0.2968779
Y | X: 0.2766555
3 第3步 检验相关系数的显著性
library(yacca)
<- cca(physical, train)
res.cca F.test.cca(res.cca)
F Test for Canonical Correlations (Rao's F Approximation)
Corr F Num df Den df Pr(>F)
CV 1 0.795608 2.048234 9.000000 34.223 0.06353 .
CV 2 0.200556 0.175782 4.000000 30.000 0.94912
CV 3 0.072570 0.084709 1.000000 16.000 0.77475
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
4 第4步 可视化
4.1 典型变量u1和典型变量v1的散点图
library(CCA)
<- cc(physical, train)
res.cc
<- data.frame(
res.cc.score physical_u1 = res.cc$scores$xscores[,1],
train_v1 = res.cc$scores$yscores[,1])
%>%
res.cc.score ggplot(aes(physical_u1,train_v1)) +
geom_point()
4.2 helio plot
#典型变量与原始变量的相关系数
library(yacca)
<- cca(physical, train)
res.cca
helio.plot(res.cca, x.name="Physical Variables",
y.name="Train Variables")
#Show variances on first canonical variate
helio.plot(res.cca, x.name="Physical Variables",
y.name="Train Variables",
type = "variance")
5 应用举例
#调用数据LifeCycleSavings
library(yacca)
data(LifeCycleSavings)
# LifeCycleSavings
# Format
# A data frame with 50 observations on 5 variables.
#
# [,1] sr numeric aggregate personal savings
# [,2] pop15 numeric % of population under 15
# [,3] pop75 numeric % of population over 75
# [,4] dpi numeric real per-capita disposable income
# [,5] ddpi numeric % growth rate of dpi
#定义两组变量
<- LifeCycleSavings[, 2:3]
pop <- LifeCycleSavings[, -(2:3)]
eco
#估计典型相关模型
<- cca(pop, eco)
cca.fit
#查看典型相关系数
$corr cca.fit
CV 1 CV 2
0.8247966 0.3652762
# 典型变量u1和典型变量v1的散点图
<- data.frame(
res.cc.score pop_u1 = res.cc$scores$xscores[,1],
eco_v1 = res.cc$scores$yscores[,1])
%>%
res.cc.score ggplot(aes(pop_u1,eco_v1)) +
geom_point()
#检验典型相关系数显著性
F.test.cca(cca.fit)
F Test for Canonical Correlations (Rao's F Approximation)
Corr F Num df Den df Pr(>F)
CV 1 0.82480 13.49772 6.00000 90 7.3e-11 ***
CV 2 0.36528 NaN 2.00000 NaN NaN
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#查看CCA模型估计结果
cca.fit
Canonical Correlation Analysis
Canonical Correlations:
CV 1 CV 2
0.8247966 0.3652762
X Coefficients:
CV 1 CV 2
pop15 -0.06377599 -0.2535544
pop75 0.34053260 -1.8221811
Y Coefficients:
CV 1 CV 2
sr 0.0592971550 0.2336554912
dpi 0.0009151786 -0.0005311762
ddpi 0.0291942000 -0.0858752749
Structural Correlations (Loadings) - X Vars:
CV 1 CV 2
pop15 -0.9829821 -0.1837015
pop75 0.9697929 -0.2439299
Structural Correlations (Loadings) - Y Vars:
CV 1 CV 2
sr 0.4910379 0.8557760
dpi 0.9545172 -0.2637266
ddpi 0.0473377 0.1407737
Aggregate Redundancy Coefficients (Total Variance Explained):
X | Y: 0.6547925
Y | X: 0.298336
#Show loadings on first canonical variate
helio.plot(cca.fit, x.name="Population Variables",
y.name="Economic Variables")
#Show variances on first canonical variate
helio.plot(cca.fit, cv=1, x.name="Population Variables",
y.name="Economic Variables", type="variance")