【参考:5-01 聚类算法kmeans和MeanShift原理 + 零售案例实战(上)_哔哩哔哩_bilibili】
【参考:机器学习/聚类/聚类算法 代码.ipynb · myaijarvis/AI - 码云 - 开源中国】
结论:
- 在本案例的数据集上,MeanShift 的聚类效果比 k-means++ 略好一些
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from pandas import plotting
import seaborn as sns
# Select matplotlib's "fivethirtyeight" style sheet for the figures created below.
plt.style.use("fivethirtyeight")
数据探索分析
# 2.1 Load the data set from the CSV file.
data = pd.read_csv("Mall_Customers.csv")

# 2.5 Visualise the distributions of annual income and age side by side.
import warnings
warnings.filterwarnings('ignore')  # silence library warnings from here on

# Apply the seaborn theme once, before any axes are created, so that both
# panels are drawn with the same style.
sns.set(style='whitegrid')
plt.rcParams['figure.figsize'] = (18, 8)  # canvas size

# 2.5.1 Left panel (1 row, 2 columns, position 1): annual income.
# histplot(kde=True) replaces the deprecated distplot; it plots counts, which
# is what the 'Count' y-axis label below promises.
plt.subplot(1, 2, 1)
sns.histplot(data['Annual Income (k$)'], kde=True)
plt.title('Distribution of Annual Income', fontsize=20)
plt.xlabel('Range of Annual Income')
plt.ylabel('Count')

# 2.5.2 Right panel (1 row, 2 columns, position 2): age.
plt.subplot(1, 2, 2)
sns.histplot(data['Age'], color='red', kde=True)
plt.title("Distribution of Age", fontsize=20)
plt.xlabel("Range of Age")
plt.ylabel("Count")
plt.show()
# 2.7 Gender distribution as a pie chart.
size = data['Gender'].value_counts()  # row count per 'Gender' value
# Take the wedge labels from the counts themselves so each wedge is always
# named after its own category, whatever order value_counts() returns.
labels = size.index.tolist()
# NOTE(review): colors/explode assume exactly two gender categories - confirm.
colors = ['lightgreen', 'orange']  # one colour per wedge
explode = [0, 0.1]  # pull the second wedge slightly out of the pie
plt.rcParams['figure.figsize'] = (9, 9)  # canvas size
plt.pie(
    size,
    colors=colors,
    explode=explode,
    labels=labels,
    shadow=True,
    autopct='%.2f%%',  # print each share as a percentage with two decimals
)
plt.title('Gender', fontsize=20)
plt.axis('off')  # hide the axes
plt.legend()
plt.show()
# 2.8 Bar chart with the number of rows for each distinct age.
plt.rcParams['figure.figsize'] = (15, 8)  # canvas size
# Pass the column as `x=`: recent seaborn releases accept only `data`
# positionally, so a positional Series would not be used as the x variable.
sns.countplot(x=data['Age'], palette='hsv')
plt.title("Distribution of Age", fontsize=20)
plt.show()
# 2.9 Bar chart with the number of rows for each distinct spending score.
plt.rcParams['figure.figsize'] = (20, 8)  # canvas size
# Pass the column as `x=`: recent seaborn releases accept only `data`
# positionally, so a positional Series would not be used as the x variable.
sns.countplot(x=data['Spending Score (1-100)'], palette='hsv')
plt.title("Distribution of Spending Score", fontsize=20)
plt.show()
# 2.10 Heat map of the pairwise correlation coefficients.
plt.rcParams['figure.figsize'] = (15, 8)  # canvas size
# Restrict to numeric columns first: 'Gender' holds strings, and pandas 2.x
# raises when corr() is asked to correlate a non-numeric column.
numeric_data = data.select_dtypes(include='number')
sns.heatmap(numeric_data.corr(), cmap='Wistia', annot=True)  # annot=True writes the value in each cell
plt.title("HeatMap for the Data", fontsize=20)
plt.show()
# 2.11 Relationship between gender and spending score (boxen plot).
plt.rcParams['figure.figsize'] = (18, 7)  # canvas size
# x/y are passed by keyword: recent seaborn releases accept only `data`
# positionally and reject two positional vectors.
sns.boxenplot(
    x=data['Gender'],
    y=data['Spending Score (1-100)'],
    palette='hsv',
)
plt.title("Gender vs Spending Score", fontsize=20)
plt.show()
# 2.12 Annual income plotted against age and against spending score.
x = data['Annual Income (k$)']  # annual income
y = data['Age']  # age
z = data['Spending Score (1-100)']  # spending score
# x/y are passed by keyword: recent seaborn releases accept only `data`
# positionally and reject two positional vectors.
sns.lineplot(x=x, y=y, color='blue')  # annual income vs age
sns.lineplot(x=x, y=z, color='black')  # annual income vs spending score
plt.title("Annual Income vs Age and Spending Score", fontsize=20)
plt.show()
Kmeans
# 3.1 Build the feature matrix from two columns of `data`:
# Annual Income (k$) and Spending Score (1-100).
feature_positions = [3, 4]  # zero-based column positions of the two features
X = data.iloc[:, feature_positions].values
print(type(X))
print(X.shape)
# 输出:
# <class 'numpy.ndarray'>
# (200, 2)
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score # 轮廓系数
# 3.2 Fit one K-Means model per candidate cluster count and record the
# silhouette score of each, to choose the best K.
range_values = np.arange(2, 10)  # candidate cluster counts
scores = []  # silhouette scores, in the same order as range_values
for k in range_values:
    # Build and train the model for this cluster count.
    model = KMeans(
        n_clusters=k,
        init='k-means++',
        n_init=10,
        max_iter=300,
        random_state=0,
    ).fit(X)
    # Silhouette coefficient with Euclidean distance, sampled over len(X) points.
    scores.append(
        silhouette_score(
            X,
            model.labels_,
            sample_size=len(X),
            metric='euclidean',
        )
    )
# 3.3 Silhouette scores as a bar chart.
plt.figure()
plt.bar(
    range_values,
    scores,
    align='center',
    color='b',
    width=0.6,
)
plt.title('Silhouette score')
plt.show()
# The author reads n_clusters=5 as the best value from this chart.

# 3.4 Silhouette scores as a line chart.
plt.plot(range_values, scores)
plt.xlabel('Cluster')  # x axis: cluster count
plt.ylabel('Score')  # y axis: silhouette score
plt.title("Silhouette score")
plt.show()
# 3.5 Cluster the data with K=5.
km = KMeans(init='k-means++', n_clusters=5, max_iter=300, n_init=10, random_state=0)
y_means = km.fit_predict(X)  # fit the model and get one cluster label per row of X
centroids = km.cluster_centers_  # cluster centres

# 3.6 Plot the clustering result, one scatter per cluster.
# Entry i is the (colour, legend name) used for cluster label i.
# NOTE(review): the segment names are tied to the label ids produced by this
# fit - confirm against the plot if the data or library version changes.
cluster_styles = [
    ('pink', 'miser'),
    ('yellow', 'general'),
    ('cyan', 'target'),
    ('magenta', 'spendthrift'),
    ('orange', 'careful'),
]
for cluster_id, (cluster_color, cluster_name) in enumerate(cluster_styles):
    members = y_means == cluster_id  # boolean mask of the rows in this cluster
    plt.scatter(X[members, 0], X[members, 1], s=100, c=cluster_color, label=cluster_name)
plt.scatter(centroids[:, 0], centroids[:, 1], marker='*', s=260, linewidths=3,
            color='black', label='centroid')

# Pad the axis limits by one unit around the data range.
x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
plt.title("K-Means clustering")
plt.xlim(x_min, x_max)
plt.ylim(y_min, y_max)
plt.xticks(())  # hide the tick marks
plt.yticks(())
plt.legend()  # without this call the label= values above are never displayed
plt.show()
Mean Shift
from sklearn.cluster import MeanShift, estimate_bandwidth
# 4.1 Mean Shift clustering.
# The bandwidth is estimated from the data; quantile must lie in [0, 1]
# (the library default is 0.3).
bandwidth = estimate_bandwidth(X, quantile=0.1)
ms = MeanShift(bandwidth=bandwidth).fit(X)

# 4.2 Plot the clustering result.
# Colour the points by the Mean Shift labels (ms.labels_), not by the K-Means
# labels in y_means, and do not assume a fixed number of clusters.
ms_labels = ms.labels_
centroids = ms.cluster_centers_  # cluster centres found by Mean Shift
cluster_ids = np.unique(ms_labels)
cluster_colors = plt.cm.rainbow(np.linspace(0, 1, len(cluster_ids)))  # one distinct colour per cluster
for cluster_id, cluster_color in zip(cluster_ids, cluster_colors):
    members = ms_labels == cluster_id  # boolean mask of the rows in this cluster
    plt.scatter(X[members, 0], X[members, 1], s=100, color=cluster_color,
                label='cluster {}'.format(cluster_id))
plt.scatter(centroids[:, 0], centroids[:, 1], marker='*', s=260, linewidths=3,
            color='black', label='centroid')

# Pad the axis limits by one unit around the data range.
x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
plt.title("Mean Shift clustering")
plt.xlim(x_min, x_max)
plt.ylim(y_min, y_max)
plt.xticks(())  # hide the tick marks
plt.yticks(())
plt.legend()
plt.show()
# 可以看到 质心多了几个 比k-means++算法好
选取三个特征展示
Age,Annual Income,Spending Score
# Feature matrix with three columns: age, annual income and spending score.
X = data[['Age', 'Annual Income (k$)', 'Spending Score (1-100)']].values

# 6.1 K-Means with five clusters.
# n_init and random_state are set explicitly, matching the other KMeans calls
# in this script, so that repeated runs give the same clustering.
km = KMeans(init='k-means++', n_clusters=5, max_iter=300, n_init=10, random_state=0)
km.fit(X)

# 6.2 Clustering result: labels and cluster centres.
labels = km.labels_  # cluster label of each row of X
centroids = km.cluster_centers_  # cluster centres
# 6.4 数据可视化
# 三维
# pip install plotly
# pip install cufflinks
import plotly.graph_objs as go
import plotly.offline as po
# 3-D scatter of the three features, one marker per row of `data`,
# coloured by the K-Means cluster label.
trace1 = go.Scatter3d(
    x=data['Age'],
    y=data['Spending Score (1-100)'],
    z=data['Annual Income (k$)'],
    mode='markers',
    marker=dict(
        # Use the fitted `labels` array directly: `data` has no 'labels'
        # column, so data['labels'] would raise KeyError.
        color=labels,
        size=10,
        line=dict(
            color=labels,
            width=12,
        ),
        opacity=0.8,
    ),
)
df = [trace1]  # list of traces to draw
layout = go.Layout(
    title='',
    margin=dict(l=0, r=0, b=0, t=0),  # no margin around the scene
    scene=dict(
        # Axis titles follow the x/y/z assignment of the trace above.
        xaxis=dict(title='Age'),
        yaxis=dict(title='Spending Score'),
        zaxis=dict(title='Annual Income'),
    ),
)
fig = go.Figure(data=df, layout=layout)
po.iplot(fig)
欢迎分享,转载请注明来源:内存溢出
评论列表(0条)