【机器学习】聚类(Kmeans、MeanShift)

【机器学习】聚类(Kmeans、MeanShift ),第1张

参考

【参考:5-01 聚类算法kmeans和MeanShift原理 + 零售案例实战(上)_哔哩哔哩_bilibili】

【参考:机器学习/聚类/聚类算法 代码.ipynb · myaijarvis/AI - 码云 - 开源中国】

结论:

  • 在本例中,MeanShift 的聚类效果比 k-means++ 要好一点
导库
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from pandas import plotting
import seaborn as sns

# Apply matplotlib's "fivethirtyeight" style sheet to the figures created below.
plt.style.use("fivethirtyeight")

数据探索分析
# 2.1 Read Mall_Customers.csv (path is relative to the working directory) into a DataFrame.
data = pd.read_csv("Mall_Customers.csv")



# 2.5 Visualize the distributions of annual income and age side by side.
import warnings
warnings.filterwarnings('ignore')  # keep library warnings out of the output

# Set the seaborn style once, before any axes exist, so both panels share it.
sns.set(style='whitegrid')
plt.rcParams['figure.figsize'] = (18, 8)  # canvas size for figures created from here on

# 2.5.1 Left panel: annual income.
plt.subplot(1, 2, 1)  # 1 row, 2 columns, slot 1
# histplot(kde=True) replaces the deprecated distplot. Its y-axis is a count,
# which is what the axis label below says.
sns.histplot(data['Annual Income (k$)'], kde=True)
plt.title('Distribution of Annual Income', fontsize=20)
plt.xlabel('Range of Annual Income')
plt.ylabel('Count')

# 2.5.2 Right panel: age.
plt.subplot(1, 2, 2)  # 1 row, 2 columns, slot 2
sns.histplot(data['Age'], kde=True, color='red')
plt.title("Distribution of Age", fontsize=20)
plt.xlabel("Range of Age")
plt.ylabel("Count")
plt.show()


# 2.7 Gender share as a pie chart.
size = data['Gender'].value_counts()  # one count per distinct gender value
# Take the wedge labels from the counts themselves so each wedge is always
# named after the value it represents, whatever order the counts come in.
labels = size.index.tolist()
colors = ['lightgreen', 'orange']  # wedge colours (pie cycles through them)
# Pull every wedge except the first slightly away from the centre;
# explode must have exactly one entry per wedge.
explode = [0.1 if position else 0 for position in range(len(size))]
plt.rcParams['figure.figsize'] = (9, 9)  # canvas size
plt.pie(size, colors=colors, explode=explode, labels=labels, shadow=True, autopct='%.2f%%')
plt.title('Gender', fontsize=20)
plt.axis('off')  # hide the axes frame
plt.legend()
plt.show()

# 2.8 Bar chart with the number of customers at each age.
plt.rcParams['figure.figsize'] = (15, 8)  # canvas size
# Pass the series as x= explicitly: current seaborn takes the first positional
# argument as `data`, not as the x variable.
sns.countplot(x=data['Age'], palette='hsv')
plt.title("Distribution of Age", fontsize=20)
plt.show()

# 2.9 Bar chart with the number of customers at each spending score.
plt.rcParams['figure.figsize'] = (20, 8)  # canvas size
# Pass the series as x= explicitly: current seaborn takes the first positional
# argument as `data`, not as the x variable.
sns.countplot(x=data['Spending Score (1-100)'], palette='hsv')
plt.title("Distribution of Spending Score", fontsize=20)
plt.show()

# 2.10 Heat map of the pairwise correlation coefficients.
plt.rcParams['figure.figsize'] = (15, 8)  # canvas size
# Correlation is only defined for numeric columns. Select them explicitly,
# because newer pandas raises on string columns such as 'Gender'.
numeric_data = data.select_dtypes(include='number')
sns.heatmap(numeric_data.corr(), cmap='Wistia', annot=True)
plt.title("HeatMap for the Data", fontsize=20)
plt.show()

# 2.11 Spending score distribution per gender (boxen plot).
plt.rcParams['figure.figsize'] = (18, 7)  # canvas size
# x and y are passed by keyword: current seaborn no longer accepts them positionally.
sns.boxenplot(x=data['Gender'], y=data['Spending Score (1-100)'], palette='hsv')
plt.title("Gender vs Spending Score", fontsize=20)
plt.show()

# 2.12 Annual income against age and against spending score, on one set of axes.
x = data['Annual Income (k$)']  # annual income
y = data['Age']  # age
z = data['Spending Score (1-100)']  # spending score

# x and y are passed by keyword: current seaborn no longer accepts them positionally.
sns.lineplot(x=x, y=y, color='blue')  # annual income vs age
sns.lineplot(x=x, y=z, color='black')  # annual income vs spending score
plt.title("Annual Income vs Age and Spending Score", fontsize=20)
plt.show()

Kmeans
# 3.1 Build the feature matrix from two columns:
# 'Annual Income (k$)' and 'Spending Score (1-100)'.
# The columns are selected by name rather than by position, so the selection
# does not depend on the column order of the CSV.
X = data[['Annual Income (k$)', 'Spending Score (1-100)']].values

print(type(X))
print(X.shape)


<class 'numpy.ndarray'>
(200, 2)

from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score # 轮廓系数

# 3.2 Fit one k-means++ model per candidate cluster count and record its
# silhouette score, so the best count can be read off the scores.
range_values = np.arange(2, 10)  # candidate cluster counts
scores = []  # one silhouette score per candidate, in the same order
for i in range_values:
    # Build the model for this cluster count and train it in one step.
    kmeans = KMeans(n_clusters=i, init='k-means++', n_init=10,
                    max_iter=300, random_state=0).fit(X)
    # Silhouette coefficient of the resulting labelling (Euclidean distance).
    score = silhouette_score(X, kmeans.labels_,
                             sample_size=len(X), metric='euclidean')
    scores.append(score)
    
# 3.3 Show the silhouette scores as a bar chart.
plt.figure()
ax = plt.gca()  # axes of the figure just created
ax.bar(range_values, scores, width=0.6, color='b', align='center')
ax.set_title('Silhouette score')
plt.show()

# 可以看到 n_clusters=5 时最好

# 3.4 Show the silhouette scores as a line chart.
ax = plt.gca()  # current axes (created on demand)
ax.plot(range_values, scores)
ax.set_title("Silhouette score")
ax.set_xlabel('Cluster')
ax.set_ylabel('Score')
plt.show()

# 3.5 Cluster the data with K = 5.
km = KMeans(n_clusters=5, init='k-means++', n_init=10, max_iter=300, random_state=0)

# Train the model and keep the cluster label assigned to each row of X.
y_means = km.fit(X).labels_

# 3.6 Plot the K-Means result: one colour per cluster, plus the centroids.
centroids = km.cluster_centers_

# (colour, legend name) for cluster ids 0..4, in that order.
cluster_styles = [
    ('pink', 'miser'),
    ('yellow', 'general'),
    ('cyan', 'target'),
    ('magenta', 'spendthrift'),
    ('orange', 'careful'),
]
for cluster_id, (colour, name) in enumerate(cluster_styles):
    members = X[y_means == cluster_id]  # rows assigned to this cluster
    plt.scatter(members[:, 0], members[:, 1], s=100, c=colour, label=name)

plt.scatter(centroids[:, 0], centroids[:, 1], marker='*', s=260, linewidths=3,
            color='black', label='centroid')

# Axis limits: the data range padded by one unit on every side.
x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
plt.title("K-Means clustering")
plt.xlim(x_min, x_max)
plt.ylim(y_min, y_max)
plt.xticks(())  # hide tick marks
plt.yticks(())
plt.legend()  # without this the label= names given above are never displayed
plt.show()

Mean Shift
from sklearn.cluster import MeanShift, estimate_bandwidth

# 4.1 Run Mean Shift clustering.

# Bandwidth (i.e. the radius); quantile lies in [0, 1] and defaults to 0.3.
bandwidth = estimate_bandwidth(X, quantile=0.1)
ms = MeanShift(bandwidth=bandwidth)
ms.fit(X)  # train on the feature matrix

# 4.2 Plot the Mean Shift result: one colour per cluster, plus the centroids.
ms_labels = ms.labels_  # cluster id Mean Shift assigned to each row of X
centroids = ms.cluster_centers_
cluster_ids = np.unique(ms_labels)

# The points must be coloured by Mean Shift's own labels. The number of
# clusters is whatever the fitted model produced, so the palette is sized
# from the labels instead of being hard-coded.
palette = plt.cm.hsv(np.linspace(0, 1, len(cluster_ids), endpoint=False))
for cluster_id, colour in zip(cluster_ids, palette):
    members = X[ms_labels == cluster_id]  # rows assigned to this cluster
    plt.scatter(members[:, 0], members[:, 1], s=100, color=tuple(colour),
                label='cluster {}'.format(cluster_id))

plt.scatter(centroids[:, 0], centroids[:, 1], marker='*', s=260, linewidths=3,
            color='black', label='centroid')

# Axis limits: the data range padded by one unit on every side.
x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
plt.title("Mean Shift clustering")
plt.xlim(x_min, x_max)
plt.ylim(y_min, y_max)
plt.xticks(())  # hide tick marks
plt.yticks(())
plt.legend()
plt.show()

# 可以看到 质心多了几个 比k-means++算法好

选取三个特征展示

Age,Annual Income,Spending Score

# Feature matrix with three columns: age, annual income and spending score.
X = data[['Age', 'Annual Income (k$)', 'Spending Score (1-100)']].values
# 6.1 K-Means with the same settings as the two-feature model above; the
# fixed random_state makes the labels reproducible from run to run.
km = KMeans(init='k-means++', n_clusters=5, max_iter=300, n_init=10, random_state=0)
km.fit(X)  # train
# 6.2 Clustering result: the label of every row and the centroids.
labels = km.labels_

centroids = km.cluster_centers_

# 6.3 Store the labels on the table: the 3-D plot below colours the points
# by data['labels'], which does not exist until it is assigned here.
data['labels'] = labels

# 6.4 数据可视化
# 三维
# pip install plotly
# pip install cufflinks

import plotly.graph_objs as go
import plotly.offline as po

# Interactive 3-D scatter of age / spending score / annual income.
# NOTE: the colours are read from data['labels'], so that column must
# already exist on `data` when this runs.
marker_style = {
    'color': data['labels'],
    'size': 10,
    'line': {'color': data['labels'], 'width': 12},
    'opacity': 0.8,
}
trace1 = go.Scatter3d(
    x=data['Age'],
    y=data['Spending Score (1-100)'],
    z=data['Annual Income (k$)'],
    mode='markers',
    marker=marker_style,
)

df = [trace1]  # list of traces for the figure (not a DataFrame)

# No title and zero margins; one title per axis of the 3-D scene.
scene_axes = {
    'xaxis': {'title': 'Age'},
    'yaxis': {'title': 'Spending Score'},
    'zaxis': {'title': 'Annual Income'},
}
layout = go.Layout(
    title='',
    margin={'l': 0, 'r': 0, 'b': 0, 't': 0},
    scene=scene_axes,
)

fig = go.Figure(data=df, layout=layout)
po.iplot(fig)

欢迎分享,转载请注明来源:内存溢出

原文地址: http://outofmemory.cn/langs/716034.html

(0)
打赏 微信扫一扫 微信扫一扫 支付宝扫一扫 支付宝扫一扫
上一篇 2022-04-25
下一篇 2022-04-25

发表评论

登录后才能评论

评论列表(0条)

保存