# PART_1 使用sklearn转换器处理数据
# Load the breast-cancer dataset that ships with sklearn.datasets.
from sklearn.datasets import load_breast_cancer
cancer = load_breast_cancer()
cancer_data = cancer['data']
cancer_target = cancer['target']
cancer_names = cancer['feature_names']
cancer_desc = cancer['DESCR']  # free-text description of the dataset

# Split the samples into a training set and a test set (test_size=0.2).
from sklearn.model_selection import train_test_split
cancer_data_train, cancer_data_test, cancer_target_train, cancer_target_test = train_test_split(
    cancer_data, cancer_target, test_size=0.2, random_state=42)

# Min-max scaling. The rule is learned from the training split only and then
# applied to both splits, so no information from the test set leaks into it.
from sklearn.preprocessing import MinMaxScaler
cancer_scaler = MinMaxScaler().fit(cancer_data_train)
cancer_trainScaler = cancer_scaler.transform(cancer_data_train)
cancer_testScaler = cancer_scaler.transform(cancer_data_test)

# PCA dimensionality reduction down to 10 components.
from sklearn.decomposition import PCA
pca_model = PCA(n_components=10).fit(cancer_trainScaler)  # learn the projection
cancer_trainPCA = pca_model.transform(cancer_trainScaler)  # apply it to the training set
cancer_testPCA = pca_model.transform(cancer_testScaler)  # apply it to the test set
# PART_2 构建并评价聚类模型
# K-means clustering
# Data preprocessing
from sklearn.datasets import load_iris
from sklearn.preprocessing import MinMaxScaler
from sklearn.cluster import KMeans
iris = load_iris()  # load the data
iris_data = iris['data']
iris_target = iris['target']
iris_names = iris['feature_names']
scale = MinMaxScaler().fit(iris_data)  # learn the scaling rule
iris_dataScale = scale.transform(iris_data)  # apply the rule
# Build and fit the model on the scaled features.
kmeans = KMeans(n_clusters=3, random_state=123).fit(iris_dataScale)
# predict() expects a 2-D array of shape (n_samples, n_features), so the single
# sample is wrapped in an outer list. The values are interpreted in the scaled
# feature space the model was fitted on.
result = kmeans.predict([[1, 1, 1, 1]])
# Visualising multi-dimensional data with t-SNE
import pandas as pd
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt

# Reduce the iris features to two dimensions.
tsne = TSNE(n_components=2, init='random', random_state=177).fit(iris_data)
df = pd.DataFrame(tsne.embedding_)
df['labels'] = kmeans.labels_  # attach the cluster assigned by K-means

# Pull out the rows belonging to each cluster label.
df1 = df[df['labels'] == 0]
df2 = df[df['labels'] == 1]
df3 = df[df['labels'] == 2]

# Draw one marker style per cluster.
fig = plt.figure(figsize=(9, 6))
plt.plot(df1[0], df1[1], 'bo', df2[0], df2[1], 'r*', df3[0], df3[1], 'gD')
plt.show()  # must be called; a bare `plt.show` displays nothing
# Clustering evaluation metrics (there are many; this one is a ready-made function)
# FMI evaluation (from sklearn.metrics): compares the cluster labels with the
# true labels.
from sklearn.metrics import fowlkes_mallows_score
for i in range(2, 7):
    kmeans = KMeans(n_clusters=i, random_state=123).fit(iris_data)
    score = fowlkes_mallows_score(iris_target, kmeans.labels_)
    print('iris数据聚%d类FMI评价分值为:%f' % (i, score))
# Silhouette-coefficient evaluation
from sklearn.metrics import silhouette_score
import matplotlib.pyplot as plt
silhouettteScore = []
for i in range(2, 15):
    kmeans = KMeans(n_clusters=i, random_state=123).fit(iris_data)
    # The silhouette coefficient is computed from the feature matrix and the
    # predicted cluster labels; it does not use the true labels.
    score = silhouette_score(iris_data, kmeans.labels_)
    silhouettteScore.append(score)
# Plot the score against the number of clusters.
plt.figure(figsize=(10, 6))
plt.plot(range(2, 15), silhouettteScore, linewidth=1.5, linestyle='-')
plt.show()
# Calinski-Harabasz evaluation
# The misspelled `calinski_harabaz_score` alias has been removed from
# scikit-learn; `calinski_harabasz_score` is the supported name.
from sklearn.metrics import calinski_harabasz_score
# Like the silhouette coefficient, this index takes the feature matrix and the
# predicted labels. Wrap it in the same loop over cluster counts as above to
# compare different values of n_clusters.
score = calinski_harabasz_score(iris_data, kmeans.labels_)
# PART_3 构建并评价分类模型
# SVM classification model
import numpy as np
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler

# Load the data and its labels so this section runs on its own.
cancer = load_breast_cancer()
cancer_data = cancer['data']
cancer_target = cancer['target']

# Split the data into a training set and a test set.
cancer_data_train, cancer_data_test, cancer_target_train, cancer_target_test = train_test_split(
    cancer_data, cancer_target, test_size=0.2, random_state=22)

# Standardise the features; the scaler is fitted on the training split only.
stdScaler = StandardScaler().fit(cancer_data_train)
cancer_trainStd = stdScaler.transform(cancer_data_train)
cancer_testStd = stdScaler.transform(cancer_data_test)

# Build and fit the SVM model.
svm = SVC().fit(cancer_trainStd, cancer_target_train)

# Predict the labels of the test set.
cancer_target_pred = svm.predict(cancer_testStd)

# Count how many predictions agree with the true labels.
true = np.sum(cancer_target_pred == cancer_target_test)
# Evaluate the classification model
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, cohen_kappa_score
print(accuracy_score(cancer_target_test, cancer_target_pred))
print(precision_score(cancer_target_test, cancer_target_pred))
print(recall_score(cancer_target_test, cancer_target_pred))
print(f1_score(cancer_target_test, cancer_target_pred))
print(cohen_kappa_score(cancer_target_test, cancer_target_pred))

# Per-class precision / recall / F1 summary in one table.
from sklearn.metrics import classification_report
print(classification_report(cancer_target_test, cancer_target_pred))
# Plot the ROC curve: true-positive rate against false-positive rate as the
# decision threshold varies. The closer the curve hugs the top-left corner,
# the better the classifier separates the two classes.
from sklearn.metrics import roc_curve
import matplotlib.pyplot as plt

# Use the continuous decision scores rather than the hard 0/1 predictions:
# with hard labels there is only one effective threshold, so the "curve"
# collapses to a single corner point joined to (0, 0) and (1, 1).
cancer_target_score = svm.decision_function(cancer_testStd)
fpr, tpr, thresholds = roc_curve(cancer_target_test, cancer_target_score)

plt.figure(figsize=(10, 6))
plt.xlim(0, 1)
plt.ylim(0.0, 1.1)
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.plot(fpr, tpr, linewidth=2, linestyle='-', color='red')
plt.show()
# PART_4 构建并评价回归模型
# Evaluate a linear regression model
from sklearn.metrics import (explained_variance_score, mean_absolute_error,
                             mean_squared_error, median_absolute_error, r2_score)
# NOTE(review): y_test and y_pred are not defined anywhere in the visible
# script. They are presumably the held-out targets and the predictions of a
# fitted regressor; define them before running this section.
print(mean_absolute_error(y_test, y_pred))
print(mean_squared_error(y_test, y_pred))
print(median_absolute_error(y_test, y_pred))
print(explained_variance_score(y_test, y_pred))
print(r2_score(y_test, y_pred))
# 欢迎分享,转载请注明来源:内存溢出
# 评论列表(0条)