import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# implement the ensemble error via the binomial probability mass function
from scipy.special import comb
import math
def ensemble_error(n_classifier,error):
    k_start=int(math.ceil(n_classifier/2))  # smallest number of wrong base classifiers that forms a majority
    probs=[comb(n_classifier,k)*(error**k)*(1-error)**(n_classifier-k)
           for k in range(k_start,n_classifier+1)]
    return sum(probs)
ensemble_error(n_classifier=11,error=0.25)
0.03432750701904297
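The ensemble errs exactly when a majority of the n base classifiers err, so ensemble_error sums the upper tail of a Binomial(n, error) distribution. As a sanity check (a minimal sketch, not part of the original notebook), the same value can be read off scipy's binomial survival function:
from scipy.stats import binom
binom.sf(5,11,0.25)  # P(X>=6) for X~Binomial(11,0.25); matches ensemble_error(n_classifier=11,error=0.25)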
# plot the ensemble error rate against the base classifier error rate
# compute the ensemble error for a range of base errors
error_range=np.arange(0.0,1.01,0.01)
ens_errors=[ensemble_error(n_classifier=11,error=err) for err in error_range]
# plot
plt.plot(error_range,ens_errors,label='Ensemble error',linewidth=2)
plt.plot(error_range,error_range,linestyle='--',
label='Base error',linewidth=2)
plt.xlabel('base error')
plt.ylabel('base/ensemble error')
plt.legend(loc='best')
plt.grid()
plt.show()
np.argmax(np.bincount([0,0,1],weights=[0.2,0.2,0.6]))  # weighted majority vote over class labels
1
np.bincount([0,0,1],weights=[0.2,0.2,0.6])
array([0.4, 0.6])
# combining predictions based on class probabilities
ex=np.array([[0.9,0.1],
[0.8,0.2],
[0.4,0.6]])
ex
array([[0.9, 0.1],
[0.8, 0.2],
[0.4, 0.6]])
p=np.average(ex,axis=0,weights=[0.2,0.2,0.6])
p
array([0.58, 0.42])
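The ensemble's label is then the index of the largest averaged probability. Note that with these weights the probability vote picks class 0, even though the plain class-label vote above picked class 1:
np.argmax(p)  # -> 0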
# majority voting
from sklearn.base import BaseEstimator
from sklearn.base import ClassifierMixin
from sklearn.preprocessing import LabelEncoder
from sklearn.base import clone
from sklearn.pipeline import _name_estimators
class MajorityVoteClassifier(BaseEstimator,ClassifierMixin):
    """
    A majority-vote ensemble classifier.

    Parameters
    ----------
    classifiers : array-like, shape = [n_classifiers]
        The classifiers to combine.
    vote : str, {'classlabel', 'probability'} (default: 'classlabel')
        If 'classlabel', the prediction is based on the argmax of the
        class-label votes; if 'probability', the argmax of the summed
        probabilities is used to predict the class label.
    weights : array-like, shape = [n_classifiers], optional (default: None)
        If a list of `int` or `float` values is provided, the classifiers
        are weighted by importance; if `weights=None`, weights are uniform.
    """
    def __init__(self,classifiers,vote='classlabel',weights=None):
        self.classifiers=classifiers
        self.named_classifiers={key: value for key,
                                value in _name_estimators(classifiers)}
        self.vote=vote
        self.weights=weights
    def fit(self,X,y):
        """
        Fit the classifiers.

        Parameters
        ----------
        X : array, shape = [n_samples, n_features]
        y : array, shape = [n_samples]

        Returns
        -------
        self : object
        """
        self.lablenc_=LabelEncoder()  # encode the labels so they start from 0
        self.lablenc_.fit(y)
        self.classes_=self.lablenc_.classes_
        self.classifiers_=[]
        for clf in self.classifiers:
            fitted_clf=clone(clf).fit(X,self.lablenc_.transform(y))  # clone builds an unfitted copy with the same parameters
            self.classifiers_.append(fitted_clf)  # collect the fitted classifiers
        return self
    def predict(self,X):
        """
        Predict class labels for X.

        Parameters
        ----------
        X : array, shape = [n_samples, n_features]

        Returns
        -------
        maj_vote : array, shape = [n_samples]
            Predicted class labels.
        """
        if self.vote=='probability':
            maj_vote=np.argmax(self.predict_proba(X),axis=1)  # index of the largest averaged probability
        else:
            predictions=np.asarray([clf.predict(X) for clf in self.classifiers_]).T
            # after transposing, each row holds the predictions of all classifiers for one sample
            maj_vote=np.apply_along_axis(lambda x:np.argmax(np.bincount(x,weights=self.weights)),
                                         axis=1,
                                         arr=predictions)
        maj_vote=self.lablenc_.inverse_transform(maj_vote)  # map back to the original labels
        return maj_vote
    def predict_proba(self,X):
        """
        Parameters
        ----------
        X : array, shape = [n_samples, n_features]

        Returns
        -------
        avg_proba : array, shape = [n_samples, n_classes]
            Weighted average probability for each class per sample.
        """
        probas=np.asarray([clf.predict_proba(X) for clf in self.classifiers_])
        avg_proba=np.average(probas,axis=0,weights=self.weights)
        return avg_proba
    def get_params(self,deep=True):
        """
        Get classifier parameter names for grid search.
        """
        if not deep:
            return super(MajorityVoteClassifier,self).get_params(deep=False)  # fall back to the parent method
        else:
            out=self.named_classifiers.copy()
            for name,step in self.named_classifiers.items():
                for key,value in step.get_params(deep=True).items():
                    out['%s_%s'%(name,key)]=value
            return out
Predicting with the majority-vote principle
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
# load the Iris dataset
iris=datasets.load_iris()
X,y=iris.data[50:,[1,2]],iris.target[50:]  # classes 1 and 2 only; features: sepal width, petal length
# encode the labels
le=LabelEncoder()
y=le.fit_transform(y)
# split into 50/50 training and test sets
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.5,random_state=1,
stratify=y)
Training three different classifiers: a logistic regression classifier, a decision tree classifier, and a kNN classifier
## cross-validation
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
clf1=LogisticRegression(penalty='l2',
C=0.001,
random_state=1)
clf2=DecisionTreeClassifier(max_depth=1,
criterion='entropy',
random_state=0)
clf3=KNeighborsClassifier(n_neighbors=1,
p=2,
metric='minkowski')
Pipeline differs from make_pipeline in that make_pipeline creates the step names automatically.
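A quick illustration of the automatic naming (a minimal sketch):
from sklearn.pipeline import make_pipeline
make_pipeline(StandardScaler(),LogisticRegression()).steps
# [('standardscaler', StandardScaler()), ('logisticregression', LogisticRegression())]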
pipe1=Pipeline(steps=[('sc',StandardScaler()),
('clf',clf1)])
Decision trees don't need feature standardization, so clf2 is used without a pipeline.
pipe3=Pipeline(steps=[('sc',StandardScaler()),
('clf',clf3)])
clf_labels=['Logistic regression','Decision tree','KNN']
print('10-fold cross validation:\n')
for clf,label in zip([pipe1,clf2,pipe3],clf_labels):
    scores=cross_val_score(estimator=clf,
                           X=X_train,
                           y=y_train,
                           cv=10,
                           scoring='roc_auc')
    print("ROC_AUC:%0.2f (+/- %0.2f) [%s]"
          %(scores.mean(),scores.std(),label))
10-fold cross validation:
ROC_AUC:0.92 (+/- 0.15) [Logistic regression]
ROC_AUC:0.87 (+/- 0.18) [Decision tree]
ROC_AUC:0.85 (+/- 0.13) [KNN]
Using the MajorityVoteClassifier
mv_clf=MajorityVoteClassifier(classifiers=[pipe1,clf2,pipe3])
clf_labels+=['Majority voting']
all_clf=[pipe1,clf2,pipe3,mv_clf]
for clf,label in zip(all_clf,clf_labels):
    scores=cross_val_score(estimator=clf,
                           X=X_train,
                           y=y_train,
                           cv=10,
                           scoring='roc_auc')
    print("ROC_AUC: %0.2f (+/-%0.2f) [%s]"
          %(scores.mean(),scores.std(),label))
ROC_AUC: 0.92 (+/-0.15) [Logistic regression]
ROC_AUC: 0.87 (+/-0.18) [Decision tree]
ROC_AUC: 0.85 (+/-0.13) [KNN]
ROC_AUC: 0.98 (+/-0.05) [Majority voting]
Evaluating and tuning the ensemble classifier
# evaluate the classifiers on the test set via ROC curves
from sklearn.metrics import roc_curve
from sklearn.metrics import auc
colors=['black','orange','blue','green']
linestyles=[':','--','-.','-']
for clf,label,clr,ls in zip(all_clf,clf_labels,colors,linestyles):
    # assume the positive class is 1
    y_pred=clf.fit(X_train,y_train).predict_proba(X_test)[:,1]
    fpr,tpr,thresholds=roc_curve(y_true=y_test,y_score=y_pred)
    roc_auc=auc(x=fpr,y=tpr)
    plt.plot(fpr,tpr,
             color=clr,
             linestyle=ls,
             label="%s (auc=%0.2f)"%(label,roc_auc))
plt.legend(loc='lower right')
plt.grid(alpha=0.7)
plt.xlabel('FPR')
plt.ylabel('TPR')
plt.show()
# plot the decision regions of each classifier
sc=StandardScaler()
X_train_std=sc.fit_transform(X_train)  # standardize so the decision tree shares the same scale as the other models in the plots
from itertools import product
x_min=X_train_std[:,0].min()-1
x_max=X_train_std[:,0].max()+1
y_min=X_train_std[:,1].min()-1
y_max=X_train_std[:,1].max()+1
The c in np.c_ stands for column: it concatenates arrays column-wise (side by side), which requires them to have the same number of rows.
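A quick illustration:
np.c_[np.array([1,2,3]),np.array([4,5,6])]
# array([[1, 4],
#        [2, 5],
#        [3, 6]])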
xx,yy=np.meshgrid(np.arange(x_min,x_max,0.1),
                  np.arange(y_min,y_max,0.1))  # coordinate matrices: xx holds the x-coordinates, yy the y-coordinates
f,axarr=plt.subplots(nrows=2,ncols=2,
                     sharex='col',
                     sharey='row',
                     figsize=(7,5))
# sharex='col': each column shares an x-axis, so only the bottom row shows x ticks
# sharey='row': each row shares a y-axis, so only the left column shows y ticks
for idx,clf,tt in zip(product([0,1],[0,1]),
                      all_clf,clf_labels):
    # product([0,1],[0,1]) yields (0,0),(0,1),(1,0),(1,1): the four subplot indices
    clf.fit(X_train_std,y_train)
    z=clf.predict(np.c_[xx.ravel(),yy.ravel()])  # predict the class (0 or 1) for every grid point
    z=z.reshape(xx.shape)
    axarr[idx[0],idx[1]].contourf(xx,yy,z,alpha=0.3)
    axarr[idx[0],idx[1]].scatter(X_train_std[y_train==0,0],
                                 X_train_std[y_train==0,1],
                                 color='blue',
                                 marker='^',
                                 s=50)
    axarr[idx[0],idx[1]].scatter(X_train_std[y_train==1,0],
                                 X_train_std[y_train==1,1],
                                 color='green',
                                 marker='o',
                                 s=50)
    axarr[idx[0],idx[1]].set_title(tt)
plt.text(-3.5,-4.5,s='sepal width',
         ha='center',va='center',
         fontsize=12)
plt.text(-13.5,4.5,
         s='petal length',
         ha='center',
         va='center',
         fontsize=12,
         rotation=90)
plt.show()
# hyperparameter tuning
mv_clf.get_params()
{'pipeline-1': Pipeline(steps=[('sc', StandardScaler()),
('clf', LogisticRegression(C=0.001, random_state=1))]),
'decisiontreeclassifier': DecisionTreeClassifier(criterion='entropy', max_depth=1, random_state=0),
'pipeline-2': Pipeline(steps=[('sc', StandardScaler()),
('clf', KNeighborsClassifier(n_neighbors=1))]),
'pipeline-1_memory': None,
'pipeline-1_steps': [('sc', StandardScaler()),
('clf', LogisticRegression(C=0.001, random_state=1))],
'pipeline-1_verbose': False,
'pipeline-1_sc': StandardScaler(),
'pipeline-1_clf': LogisticRegression(C=0.001, random_state=1),
'pipeline-1_sc__copy': True,
'pipeline-1_sc__with_mean': True,
'pipeline-1_sc__with_std': True,
'pipeline-1_clf__C': 0.001,
'pipeline-1_clf__class_weight': None,
'pipeline-1_clf__dual': False,
'pipeline-1_clf__fit_intercept': True,
'pipeline-1_clf__intercept_scaling': 1,
'pipeline-1_clf__l1_ratio': None,
'pipeline-1_clf__max_iter': 100,
'pipeline-1_clf__multi_class': 'auto',
'pipeline-1_clf__n_jobs': None,
'pipeline-1_clf__penalty': 'l2',
'pipeline-1_clf__random_state': 1,
'pipeline-1_clf__solver': 'lbfgs',
'pipeline-1_clf__tol': 0.0001,
'pipeline-1_clf__verbose': 0,
'pipeline-1_clf__warm_start': False,
'decisiontreeclassifier_ccp_alpha': 0.0,
'decisiontreeclassifier_class_weight': None,
'decisiontreeclassifier_criterion': 'entropy',
'decisiontreeclassifier_max_depth': 1,
'decisiontreeclassifier_max_features': None,
'decisiontreeclassifier_max_leaf_nodes': None,
'decisiontreeclassifier_min_impurity_decrease': 0.0,
'decisiontreeclassifier_min_samples_leaf': 1,
'decisiontreeclassifier_min_samples_split': 2,
'decisiontreeclassifier_min_weight_fraction_leaf': 0.0,
'decisiontreeclassifier_random_state': 0,
'decisiontreeclassifier_splitter': 'best',
'pipeline-2_memory': None,
'pipeline-2_steps': [('sc', StandardScaler()),
('clf', KNeighborsClassifier(n_neighbors=1))],
'pipeline-2_verbose': False,
'pipeline-2_sc': StandardScaler(),
'pipeline-2_clf': KNeighborsClassifier(n_neighbors=1),
'pipeline-2_sc__copy': True,
'pipeline-2_sc__with_mean': True,
'pipeline-2_sc__with_std': True,
'pipeline-2_clf__algorithm': 'auto',
'pipeline-2_clf__leaf_size': 30,
'pipeline-2_clf__metric': 'minkowski',
'pipeline-2_clf__metric_params': None,
'pipeline-2_clf__n_jobs': None,
'pipeline-2_clf__n_neighbors': 1,
'pipeline-2_clf__p': 2,
'pipeline-2_clf__weights': 'uniform'}
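These keys follow the naming scheme of the custom get_params above: the name assigned by _name_estimators, joined by a single underscore to the wrapped estimator's own double-underscore parameter path. They are exactly the keys used in param_grid below; for example:
mv_clf.get_params()['pipeline-1_clf__C']  # -> 0.001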
from sklearn.model_selection import GridSearchCV
# tune the logistic regression and decision tree parameters
params={'decisiontreeclassifier_max_depth':[1,2],
'pipeline-1_clf__C':[0.001,0.1,100.0]}
grid=GridSearchCV(estimator=mv_clf,
param_grid=params,
cv=10,
scoring='roc_auc')
grid.fit(X_train,y_train)
GridSearchCV(cv=10,
estimator=MajorityVoteClassifier(classifiers=[Pipeline(steps=[('sc',
StandardScaler()),
('clf',
LogisticRegression(C=0.001,
random_state=1))]),
DecisionTreeClassifier(criterion='entropy',
max_depth=1,
random_state=0),
Pipeline(steps=[('sc',
StandardScaler()),
('clf',
KNeighborsClassifier(n_neighbors=1))])]),
param_grid={'decisiontreeclassifier_max_depth': [1, 2],
'pipeline-1_clf__C': [0.001, 0.1, 100.0]},
scoring='roc_auc')
grid.cv_results_
{'mean_fit_time': array([0.0076185 , 0.00660717, 0.00634389, 0.00440733, 0.00522645,
0.00517545]),
'std_fit_time': array([0.00748017, 0.00485841, 0.00090428, 0.00044422, 0.00073637,
0.00052229]),
'mean_score_time': array([0.00169656, 0.00185349, 0.00189853, 0.00183671, 0.00169687,
0.00145319]),
'std_score_time': array([0.000457 , 0.00053638, 0.00069667, 0.00033673, 0.00063198,
0.00047215]),
'param_decisiontreeclassifier_max_depth': masked_array(data=[1, 1, 1, 2, 2, 2],
mask=[False, False, False, False, False, False],
fill_value='?',
dtype=object),
'param_pipeline-1_clf__C': masked_array(data=[0.001, 0.1, 100.0, 0.001, 0.1, 100.0],
mask=[False, False, False, False, False, False],
fill_value='?',
dtype=object),
'params': [{'decisiontreeclassifier_max_depth': 1,
'pipeline-1_clf__C': 0.001},
{'decisiontreeclassifier_max_depth': 1, 'pipeline-1_clf__C': 0.1},
{'decisiontreeclassifier_max_depth': 1, 'pipeline-1_clf__C': 100.0},
{'decisiontreeclassifier_max_depth': 2, 'pipeline-1_clf__C': 0.001},
{'decisiontreeclassifier_max_depth': 2, 'pipeline-1_clf__C': 0.1},
{'decisiontreeclassifier_max_depth': 2, 'pipeline-1_clf__C': 100.0}],
'split0_test_score': array([1., 1., 1., 1., 1., 1.]),
'split1_test_score': array([1., 1., 1., 1., 1., 1.]),
'split2_test_score': array([1., 1., 1., 1., 1., 1.]),
'split3_test_score': array([1., 1., 1., 1., 1., 1.]),
'split4_test_score': array([0.83333333, 0.83333333, 0.66666667, 0.83333333, 0.83333333,
0.66666667]),
'split5_test_score': array([1., 1., 1., 1., 1., 1.]),
'split6_test_score': array([1., 1., 1., 1., 1., 1.]),
'split7_test_score': array([1., 1., 1., 1., 1., 1.]),
'split8_test_score': array([1., 1., 1., 1., 1., 1.]),
'split9_test_score': array([1., 1., 1., 1., 1., 1.]),
'mean_test_score': array([0.98333333, 0.98333333, 0.96666667, 0.98333333, 0.98333333,
0.96666667]),
'std_test_score': array([0.05, 0.05, 0.1 , 0.05, 0.05, 0.1 ]),
'rank_test_score': array([1, 1, 5, 1, 1, 5])}
# list the scores for every hyperparameter combination
for i,_ in enumerate(grid.cv_results_['mean_test_score']):
    print("%0.3f +/- %0.2f %r"
          %(grid.cv_results_['mean_test_score'][i],
            grid.cv_results_['std_test_score'][i],
            grid.cv_results_['params'][i]))
0.983 +/- 0.05 {'decisiontreeclassifier_max_depth': 1, 'pipeline-1_clf__C': 0.001}
0.983 +/- 0.05 {'decisiontreeclassifier_max_depth': 1, 'pipeline-1_clf__C': 0.1}
0.967 +/- 0.10 {'decisiontreeclassifier_max_depth': 1, 'pipeline-1_clf__C': 100.0}
0.983 +/- 0.05 {'decisiontreeclassifier_max_depth': 2, 'pipeline-1_clf__C': 0.001}
0.983 +/- 0.05 {'decisiontreeclassifier_max_depth': 2, 'pipeline-1_clf__C': 0.1}
0.967 +/- 0.10 {'decisiontreeclassifier_max_depth': 2, 'pipeline-1_clf__C': 100.0}
A note on understanding the `for _ in …` idiom:
https://blog.csdn.net/hello_world_blog/article/details/89314816
print("best parameters: %s"%(grid.best_params_))
best parameters: {'decisiontreeclassifier_max_depth': 1, 'pipeline-1_clf__C': 0.001}
print('ROC AUC: %0.2f'%(grid.best_score_))
ROC AUC: 0.98
7.3 Bagging: building an ensemble of classifiers from bootstrap samples
df_wine=pd.read_csv('wine-Copy1.data',header=None,names=['label',
'alcohol',
'malic acid',
'ash',
'alcalinity of ash',
'magnesium',
'total phenols',
'flavanoids',
'nonflavanoid phenols',
'proanthocyanins',
'color intensity',
'hue',
'OD280',
'proline'])
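If the local wine-Copy1.data copy is unavailable, the same data can be fetched from the UCI repository (a sketch; uci_url is a name introduced here, and it assumes the mirror is reachable):
uci_url=('https://archive.ics.uci.edu/ml/'
         'machine-learning-databases/wine/wine.data')
df_wine=pd.read_csv(uci_url,header=None)
df_wine.columns=['label','alcohol','malic acid','ash','alcalinity of ash',
                 'magnesium','total phenols','flavanoids','nonflavanoid phenols',
                 'proanthocyanins','color intensity','hue','OD280','proline']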
df_wine.head()
|   | label | alcohol | malic acid | ash  | alcalinity of ash | magnesium | total phenols | flavanoids | nonflavanoid phenols | proanthocyanins | color intensity | hue  | OD280 | proline |
|---|-------|---------|------------|------|-------------------|-----------|---------------|------------|----------------------|-----------------|-----------------|------|-------|---------|
| 0 | 1     | 14.23   | 1.71       | 2.43 | 15.6              | 127       | 2.80          | 3.06       | 0.28                 | 2.29            | 5.64            | 1.04 | 3.92  | 1065    |
| 1 | 1     | 13.20   | 1.78       | 2.14 | 11.2              | 100       | 2.65          | 2.76       | 0.26                 | 1.28            | 4.38            | 1.05 | 3.40  | 1050    |
| 2 | 1     | 13.16   | 2.36       | 2.67 | 18.6              | 101       | 2.80          | 3.24       | 0.30                 | 2.81            | 5.68            | 1.03 | 3.17  | 1185    |
| 3 | 1     | 14.37   | 1.95       | 2.50 | 16.8              | 113       | 3.85          | 3.49       | 0.24                 | 2.18            | 7.80            | 0.86 | 3.45  | 1480    |
| 4 | 1     | 13.24   | 2.59       | 2.87 | 21.0              | 118       | 2.80          | 2.69       | 0.39                 | 1.82            | 4.32            | 1.04 | 2.93  | 735     |
# keep only classes 2 and 3
df_wine=df_wine[df_wine['label']!=1]
y=df_wine['label'].values
X=df_wine[['alcohol','OD280']].values  # two features: alcohol and OD280
# encode the class labels
from sklearn.preprocessing import LabelEncoder
le=LabelEncoder()
y=le.fit_transform(y)
y
array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1], dtype=int64)
# split into training and test sets
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(X,y,
test_size=0.2,
stratify=y,
random_state=1)
# apply the BaggingClassifier algorithm
from sklearn.ensemble import BaggingClassifier
tree=DecisionTreeClassifier(criterion='entropy',
random_state=1,
max_depth=None)
# build an ensemble of 500 decision trees
bag=BaggingClassifier(base_estimator=tree,
                      n_estimators=500,
                      max_samples=1.0,  # float: each bootstrap sample is as large as the training set
                      max_features=1.0,
                      bootstrap=True,
                      bootstrap_features=False,
                      n_jobs=1,
                      random_state=1)
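Each of the 500 trees is fit on a bootstrap sample: n draws with replacement from the n training rows. A minimal sketch of what BaggingClassifier does internally for one tree (illustrative only; the variable names are introduced here):
rng=np.random.default_rng(1)
boot_idx=rng.integers(0,len(X_train),size=len(X_train))  # sample row indices with replacement
X_boot,y_boot=X_train[boot_idx],y_train[boot_idx]  # one bootstrap sample; ~63% unique rows on average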
# accuracy of a single decision tree on the training and test sets
from sklearn.metrics import accuracy_score
tree.fit(X_train,y_train)
y_train_pred=tree.predict(X_train)
y_test_pred=tree.predict(X_test)
tree_train=accuracy_score(y_train,y_train_pred)  # training accuracy
tree_test=accuracy_score(y_test,y_test_pred)  # test accuracy
print("decision tree train/test accuracy %0.3f/%0.3f"%(tree_train,tree_test))  # perfect train, weaker test: overfitting
decision tree train/test accuracy 1.000/0.833
# accuracy of the bagging ensemble on the training and test sets
bag=bag.fit(X_train,y_train)
y_train_pred=bag.predict(X_train)
y_test_pred=bag.predict(X_test)
bag_train=accuracy_score(y_train,y_train_pred)
bag_test=accuracy_score(y_test,y_test_pred)
print('bagging train/test accuracy: %0.3f/%0.3f'%(bag_train,bag_test))
bagging train/test accuracy: 1.000/0.917
# visualize the decision regions on the training set
x_min=X_train[:,0].min()-1
x_max=X_train[:,0].max()+1
y_min=X_train[:,1].min()-2
y_max=X_train[:,1].max()+2
xx,yy=np.meshgrid(np.arange(x_min,x_max,0.1),
                  np.arange(y_min,y_max,0.1))  # coordinate matrices: xx holds the x-coordinates, yy the y-coordinates
f,axarr=plt.subplots(nrows=1,ncols=2,
                     sharex='col',
                     sharey='row',
                     figsize=(8,3))
# sharex='col': only the bottom row shows x ticks; sharey='row': only the left column shows y ticks
for idx,clf,tt in zip([0,1],
                      [tree,bag],['decision tree','bagging']):
    clf.fit(X_train,y_train)
    z=clf.predict(np.c_[xx.ravel(),yy.ravel()])  # predict the class (0 or 1) for every grid point
    z=z.reshape(xx.shape)
    axarr[idx].contourf(xx,yy,z,alpha=0.3)
    axarr[idx].scatter(X_train[y_train==0,0],
                       X_train[y_train==0,1],
                       color='blue',
                       marker='^',
                       s=50)
    axarr[idx].scatter(X_train[y_train==1,0],
                       X_train[y_train==1,1],
                       color='green',
                       marker='o',
                       s=50)
    axarr[idx].set_title(tt)
axarr[0].set_ylabel('OD280',fontsize=12)  # X[:,1] (OD280) is on the y-axis
plt.text(9.5,-2,s='Alcohol',ha='center',va='center',fontsize=12)  # X[:,0] (alcohol) is on the x-axis
plt.show()
7.4 Boosting via AdaBoost
# data: reuse the wine features and train/test split from above
from sklearn.ensemble import AdaBoostClassifier
tree=DecisionTreeClassifier(criterion='entropy',
random_state=1,
max_depth=1)
ada=AdaBoostClassifier(base_estimator=tree,
n_estimators=500,
learning_rate=0.1,
                       random_state=1)  # an ensemble of 500 decision stumps
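Under the hood, AdaBoost reweights the training samples after every round so that the next stump concentrates on the examples the current one misclassified. A minimal sketch of one round of the discrete (SAMME) update with learning_rate=0.1 (illustrative only, not the library internals verbatim; all names are introduced here):
w=np.ones(len(y_train))/len(y_train)  # start from uniform sample weights
stump=DecisionTreeClassifier(max_depth=1).fit(X_train,y_train,sample_weight=w)
miss=stump.predict(X_train)!=y_train  # boolean mask of misclassified samples
err=np.sum(w[miss])  # weighted error rate of this stump
alpha=0.1*np.log((1-err)/err)  # the stump's voting weight, scaled by the learning rate
w*=np.exp(alpha*miss)  # up-weight only the misclassified samples
w/=w.sum()  # renormalize to a probability distribution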
tree.fit(X_train,y_train)
y_train_pred=tree.predict(X_train)
y_test_pred=tree.predict(X_test)
tree_train=accuracy_score(y_train,y_train_pred)
tree_test=accuracy_score(y_test,y_test_pred)
print("Decison Tree train/test accuracy: %0.3f/%0.3f"%(tree_train,tree_test))
Decison Tree train/test accuracy: 0.916/0.875
# use learning curves to diagnose underfitting vs. overfitting
import matplotlib.pyplot as plt
from sklearn.model_selection import learning_curve
train_sizes,train_scores,test_scores=learning_curve(estimator=tree,
X=X_train,
y=y_train,
train_sizes=np.linspace(0.1,1,10),
cv=10,
n_jobs=1)
train_mean=np.mean(train_scores,axis=1)
test_mean=np.mean(test_scores,axis=1)
plt.plot(train_sizes,train_mean,color='g',marker='o',
label='training accuracy')
plt.plot(train_sizes,test_mean,color='r',marker='^',
label='validation accuracy')
plt.legend(loc='best')
plt.show()
Underfitting: the training and validation curves are not smooth but still fluctuate, i.e. the model has not converged; if it had, the curves would flatten toward the end.
As a rule of thumb: training accuracy above 0.9 with validation/test accuracy far below 0.9 indicates overfitting;
training accuracy far below 0.9 together with validation/test accuracy far below 0.9 indicates underfitting.
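A rough helper that encodes this rule of thumb (a sketch; the 0.9 threshold is just the heuristic above, not a standard):
def fit_diagnosis(train_acc,val_acc,threshold=0.9):
    if train_acc>=threshold and val_acc<threshold:
        return 'overfitting'  # high train, much lower validation
    if train_acc<threshold and val_acc<threshold:
        return 'underfitting'  # low accuracy everywhere
    return 'reasonable fit'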
# fit the AdaBoost ensemble
ada.fit(X_train,y_train)
y_train_pred=ada.predict(X_train)
y_test_pred=ada.predict(X_test)
ada_train=accuracy_score(y_train,y_train_pred)
ada_test=accuracy_score(y_test,y_test_pred)
print("Adaboost train/test accuracy:%0.3f/%0.3f"%(ada_train,ada_test))
Adaboost train/test accuracy:1.000/0.917
train_sizes1,train_scores1,test_scores1=learning_curve(estimator=ada,
X=X_train,
y=y_train,
train_sizes=np.linspace(0.1,1,10),
cv=10,
n_jobs=1)
train_mean1=np.mean(train_scores1,axis=1)
test_mean1=np.mean(test_scores1,axis=1)
plt.plot(train_sizes1,train_mean1,color='k',marker='o',
label='training accuracy')
plt.plot(train_sizes1,test_mean1,color='b',marker='^',
label='validation accuracy')
plt.title('Adaboost learning curve')
plt.xlabel('numbers of samples')
plt.ylabel('accuracy')
plt.legend(loc='best')
plt.show()
The learning curve suggests the model is still somewhat underfit.
# visualize the decision regions on the training set
x_min=X_train[:,0].min()-1
x_max=X_train[:,0].max()+1
y_min=X_train[:,1].min()-2
y_max=X_train[:,1].max()+2
xx,yy=np.meshgrid(np.arange(x_min,x_max,0.1),
                  np.arange(y_min,y_max,0.1))  # coordinate matrices: xx holds the x-coordinates, yy the y-coordinates
f,axarr=plt.subplots(nrows=1,ncols=2,
                     sharex='col',
                     sharey='row',
                     figsize=(8,3))
# sharex='col': only the bottom row shows x ticks; sharey='row': only the left column shows y ticks
for idx,clf,tt in zip([0,1],
                      [tree,ada],['decision tree','Adaboost']):
    clf.fit(X_train,y_train)
    z=clf.predict(np.c_[xx.ravel(),yy.ravel()])  # predict the class (0 or 1) for every grid point
    z=z.reshape(xx.shape)
    axarr[idx].contourf(xx,yy,z,alpha=0.3)
    axarr[idx].scatter(X_train[y_train==0,0],
                       X_train[y_train==0,1],
                       color='blue',
                       marker='^',
                       s=50)
    axarr[idx].scatter(X_train[y_train==1,0],
                       X_train[y_train==1,1],
                       color='green',
                       marker='o',
                       s=50)
    axarr[idx].set_title(tt)
axarr[0].set_ylabel('OD280',fontsize=12)  # X[:,1] (OD280) is on the y-axis
plt.text(9.5,-2,s='Alcohol',ha='center',va='center',fontsize=12)  # X[:,0] (alcohol) is on the x-axis
plt.show()