本章介绍R中的典型相关分析(Canonical Correlation Analysis)。

#安装和加载包
#install.packages("CCA")
#install.packages("yacca")

1 第1步 定义两组变量

library(readr)
#读取数据文件
library(readxl)
eg9_1 <- read_excel("eg9.1.xls")

#对eg9_1重命名,去掉第1列,保存为数据框data
library(tidyverse)
data <- eg9_1 %>% rename(
  weight = x1,  waist = x2,  pulse = x3, 
  chinup = y1,   situp = y2,   jump = y3) %>% select(-1)
#定义两组变量
physical <- data[,1:3]
train <- data[,4:6]

2 第2步: 计算典型相关分析模型

2.1 方法一:CCA::cc

library(CCA)
res.cc <- cc(physical, train)

#查看典型相关系数
res.cc$cor
[1] 0.79560815 0.20055604 0.07257029
#查看典型变量与原始变量的关系
res.cc$xcoef
               [,1]        [,2]         [,3]
weight  0.031404688  0.07631951 -0.007735047
waist  -0.493241676 -0.36872299  0.158033647
pulse   0.008199315  0.03205199  0.145732242
res.cc$ycoef
              [,1]         [,2]         [,3]
chinup  0.06611399  0.071041211 -0.245275347
situp   0.01684623 -0.001973745  0.019767637
jump   -0.01397157 -0.020714106 -0.008167472

2.2 方法二:yacca::cca

library(yacca)
res.cca <- cca(physical, train)

#查看结果
res.cca

Canonical Correlation Analysis

Canonical Correlations:
      CV 1       CV 2       CV 3 
0.79560815 0.20055604 0.07257029 

X Coefficients:
               CV 1        CV 2         CV 3
weight -0.031404688 -0.07631951 -0.007735047
waist   0.493241676  0.36872299  0.158033647
pulse  -0.008199315 -0.03205199  0.145732242

Y Coefficients:
              CV 1         CV 2         CV 3
chinup -0.06611399 -0.071041211 -0.245275347
situp  -0.01684623  0.001973745  0.019767637
jump    0.01397157  0.020714106 -0.008167472

Structural Correlations (Loadings) - X Vars:
             CV 1       CV 2        CV 3
weight  0.6206424 -0.7723919 -0.13495886
waist   0.9254249 -0.3776614 -0.03099486
pulse  -0.3328481  0.0414842  0.94206752

Structural Correlations (Loadings) - Y Vars:
             CV 1      CV 2        CV 3
chinup -0.7276254 0.2369522 -0.64375064
situp  -0.8177285 0.5730231  0.05444915
jump   -0.1621905 0.9586280 -0.23393722

Aggregate Redundancy Coefficients (Total Variance Explained):
    X | Y: 0.2968779 
    Y | X: 0.2766555 

3 第3步 检验相关系数的显著性

library(yacca)
res.cca <- cca(physical, train)
F.test.cca(res.cca)

    F Test for Canonical Correlations (Rao's F Approximation)

         Corr        F   Num df Den df  Pr(>F)  
CV 1 0.795608 2.048234 9.000000 34.223 0.06353 .
CV 2 0.200556 0.175782 4.000000 30.000 0.94912  
CV 3 0.072570 0.084709 1.000000 16.000 0.77475  
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

4 第4步 可视化

4.1 典型变量u1和典型变量v1的散点图

library(CCA)
res.cc <- cc(physical, train)

res.cc.score <- data.frame(
  physical_u1 = res.cc$scores$xscores[,1], 
  train_v1 = res.cc$scores$yscores[,1])

res.cc.score %>% 
  ggplot(aes(physical_u1,train_v1)) +
  geom_point()

4.2 helio plot

#典型变量与原始变量的相关系数
library(yacca)
res.cca <- cca(physical, train)

helio.plot(res.cca, x.name="Physical Variables",
           y.name="Train Variables")

#Show variances on first canonical variate
helio.plot(res.cca, x.name="Physical Variables",
           y.name="Train Variables", 
           type = "variance")

5 应用举例

#调用数据LifeCycleSavings
library(yacca)
data(LifeCycleSavings)

# LifeCycleSavings
# Format
# A data frame with 50 observations on 5 variables.
# 
# [,1]  sr  numeric aggregate personal savings
# [,2]  pop15   numeric % of population under 15
# [,3]  pop75   numeric % of population over 75
# [,4]  dpi numeric real per-capita disposable income
# [,5]  ddpi    numeric % growth rate of dpi

#定义两组变量
pop <- LifeCycleSavings[, 2:3]
eco <- LifeCycleSavings[, -(2:3)]

#估计典型相关模型
cca.fit <- cca(pop, eco)

#查看典型相关系数
cca.fit$corr
     CV 1      CV 2 
0.8247966 0.3652762 
# 典型变量u1和典型变量v1的散点图
res.cc.score <- data.frame(
  pop_u1 = res.cc$scores$xscores[,1], 
  eco_v1 = res.cc$scores$yscores[,1])

res.cc.score %>% 
  ggplot(aes(pop_u1,eco_v1)) +
  geom_point()

#检验典型相关系数显著性
F.test.cca(cca.fit)

    F Test for Canonical Correlations (Rao's F Approximation)

         Corr        F   Num df Den df  Pr(>F)    
CV 1  0.82480 13.49772  6.00000     90 7.3e-11 ***
CV 2  0.36528      NaN  2.00000    NaN     NaN    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#查看CCA模型估计结果
cca.fit

Canonical Correlation Analysis

Canonical Correlations:
     CV 1      CV 2 
0.8247966 0.3652762 

X Coefficients:
             CV 1       CV 2
pop15 -0.06377599 -0.2535544
pop75  0.34053260 -1.8221811

Y Coefficients:
             CV 1          CV 2
sr   0.0592971550  0.2336554912
dpi  0.0009151786 -0.0005311762
ddpi 0.0291942000 -0.0858752749

Structural Correlations (Loadings) - X Vars:
            CV 1       CV 2
pop15 -0.9829821 -0.1837015
pop75  0.9697929 -0.2439299

Structural Correlations (Loadings) - Y Vars:
          CV 1       CV 2
sr   0.4910379  0.8557760
dpi  0.9545172 -0.2637266
ddpi 0.0473377  0.1407737

Aggregate Redundancy Coefficients (Total Variance Explained):
    X | Y: 0.6547925 
    Y | X: 0.298336 
#Show loadings on first canonical variate
helio.plot(cca.fit, x.name="Population Variables",
           y.name="Economic Variables")

#Show variances on first canonical variate
helio.plot(cca.fit, cv=1, x.name="Population Variables",
           y.name="Economic Variables", type="variance")