Chapter 7 Ensemble Learning


7.2 Majority voting ensembles
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
#Implement the probability mass function for the ensemble error
from scipy.special import comb
import math
def ensemble_error(n_classifier,error):
    k_start=int(math.ceil(n_classifier/2))
    probs=[comb(n_classifier,k)*(error**k)*(1-error)**(n_classifier-k) 
          for k in range(k_start,n_classifier+1)]
    
    return sum(probs)
ensemble_error(n_classifier=11,error=0.25)
0.03432750701904297
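The function evaluates the binomial formula for the probability that a majority of the n base classifiers errs at the same time, assuming independent base classifiers with a common error rate ε:

P_{ens} = \sum_{k=\lceil n/2 \rceil}^{n} \binom{n}{k}\,\varepsilon^{k}\,(1-\varepsilon)^{n-k}

With n = 11 and ε = 0.25 this gives 0.034, far below the base error rate.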
#Plot the ensemble error rate against the base classifier error rate

#Compute the ensemble error over a range of base error rates
error_range=np.arange(0.0,1.01,0.01)
ens_errors=[ensemble_error(n_classifier=11,error=err) for err in error_range]

#Plot
plt.plot(error_range,ens_errors,label='Ensemble error',linewidth=2)
plt.plot(error_range,error_range,linestyle='--',
       label='Base error',linewidth=2)
plt.xlabel('Base error')
plt.ylabel('Base/ensemble error')
plt.legend(loc='best')
plt.grid()
plt.show()

np.argmax(np.bincount([0,0,1],weights=[0.2,0.2,0.6]))
1
np.bincount([0,0,1],weights=[0.2,0.2,0.6])
array([0.4, 0.6])
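This is the weighted majority (hard) vote: np.bincount accumulates the total weight received by each class label and np.argmax returns the winner. As a formula:

\hat{y} = \arg\max_{i} \sum_{j=1}^{m} w_j\,\chi_A\big(C_j(\mathbf{x})=i\big)

where C_j is the j-th classifier, w_j its weight, and \chi_A the characteristic (indicator) function.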
#Voting based on class probabilities
ex=np.array([[0.9,0.1],
           [0.8,0.2],
           [0.4,0.6]])
ex
array([[0.9, 0.1],
       [0.8, 0.2],
       [0.4, 0.6]])
p=np.average(ex,axis=0,weights=[0.2,0.2,0.6])
p
array([0.58, 0.42])
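The soft-vote counterpart averages the predicted class probabilities p_{ij} with the same weights and takes the argmax:

\hat{y} = \arg\max_{i} \sum_{j=1}^{m} w_j\,p_{ij}

Here class 0 wins with 0.2*0.9 + 0.2*0.8 + 0.6*0.4 = 0.58.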
#Majority vote classifier
from sklearn.base import BaseEstimator
from sklearn.base import ClassifierMixin
from sklearn.preprocessing import LabelEncoder
from sklearn.base import clone
from sklearn.pipeline import _name_estimators
class MajorityVoteClassifier(BaseEstimator,ClassifierMixin):

    """
  多数票集成分类器
  
  parameters
  -------------
  Classifiers:array,shape=[n_classifiers]
  
  vote:str{'classlabel ','probability'}
  Defult:'classlabel'
  if 'classlabel',prediction基于类别标签的%alias大值的索引
  if'probability',概率的总和的最大值被用来索引被用来预测标签
  
  weights:array,shape=[n_classifiers]
  optional,default:None
  ‘int’或者‘flaot’的列表被提供,那么分类器赋予权重,按照重要性。
  如果‘wights=None’,则权重均匀
  
    """
    
    def __init__(self,classifiers,vote='classlabel',weights=None):
        
        self.classifiers=classifiers
        self.named_classifiers={key: value for key,
                                value in _name_estimators(classifiers)}
        self.vote=vote
        self.weights=weights
        
    def fit(self,X,y):
        """
        fit classifiers
        
        parametes
        ------------
        X:array,shape=[n_samples,n_features]
        
        y:array,shape=[n_samples]
        
        returns
        ___________
        self:objects
        """
        self.lablenc_=LabelEncoder()#标签编码,从0开始
        self.lablenc_.fit(y)
        self.classes_=self.lablenc_.classes_
        self.classifiers_=[]
        for clf in self.classifiers:
            fitted_clf=clone(clf).fit(X,self.lablenc_.transform(y))#clone构建一个相同参数的calssifiers
            self.classifiers_.append(fitted_clf)#一些fitted的分类器
            
        return self
    
    def predict(self,X):
        
        """
        parametes
        ___________
        X:array,shape=[n_samples,n_features]
        
        returns
        ___________
        mai_vote:array,shape=[n_samples]
        预测的类的标签
        
        """
        if self.vote=='probability':
            maj_vote=np.argmax(self.predict_proba(X),axis=1)#返回最大值的索引
            
        else:
            predictions=np.asarray([clf.predict(X) for clf in self.classifiers_]).T
            #转置后,每行对应三个分类器对某个样本的预测类别
            
            maj_vote=np.apply_along_axis(lambda x:np.argmax(np.bincount(x,weights=self.weights)),
                                        axis=1,
                                        arr=predictions)
            
        maj_vote=self.lablenc_.inverse_transform(maj_vote)#反转化,获得原始标签
        return maj_vote
        
        
    def predict_proba(self,X):
        

        """
        parametes
         ___________
        X:array,shape=[n_samples,n_features]
        
        returns
        ------------
        avg_proda:array
        shape=[n_samples,n_classes]
        对于每个样本属于每个类别的加权平均概率

        """
        probas=np.asarray([clf.predict_proba(X) for clf in self.classifiers_])
        
        avg_proba=np.average(probas,axis=0,weights=self.weights)
        
        return avg_proba
    
    def get_params(self,deep=True):
        
        """
        为了网格搜索获得分类器参数
        """
        
        if not deep:
            return super(MajorityVoteClassifier,self).get_params(deep=False)  #调用父类的方法
        
        else:
            out=self.named_classifiers.copy()
            for name,step in six.iteritems(self.named_classifiers):
                for key,value in six.iteritems(step.get_params(deep=True)):
                    out['%s_%s'%(name,key)]=value
            return out
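A quick sanity check of the weighted-vote core used in predict(), on made-up predictions (the toy array below is illustrative only):

toy_predictions=np.array([[0,0,1],
                          [1,1,1],
                          [0,1,1]])#rows = samples, columns = the three classifiers' votes
np.apply_along_axis(lambda x:np.argmax(np.bincount(x,weights=[0.2,0.2,0.6])),
                   axis=1,arr=toy_predictions)
array([1, 1, 1], dtype=int64)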
        
Predicting with the majority vote principle
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
#Load the Iris dataset: classes 1 and 2 only, with two features (sepal width, petal length)
iris=datasets.load_iris()
X,y=iris.data[50:,[1,2]],iris.target[50:]
#Encode the labels
le=LabelEncoder()
y=le.fit_transform(y)
#Split into training and test sets
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.5,random_state=1,
                                              stratify=y)

Use three different classifiers: a logistic regression classifier, a decision tree classifier, and a kNN classifier.

##Cross-validation
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
clf1=LogisticRegression(penalty='l2',
                       C=0.001,
                       random_state=1)
clf2=DecisionTreeClassifier(max_depth=1,
                           criterion='entropy',
                           random_state=0)
clf3=KNeighborsClassifier(n_neighbors=1,
                         p=2,
                         metric='minkowski')

The difference between Pipeline and make_pipeline is that make_pipeline creates the step names automatically.
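A quick illustration: make_pipeline derives the step names from the lowercased class names.

from sklearn.pipeline import make_pipeline
auto_pipe=make_pipeline(StandardScaler(),LogisticRegression())
auto_pipe.steps
[('standardscaler', StandardScaler()), ('logisticregression', LogisticRegression())]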

pipe1=Pipeline(steps=[('sc',StandardScaler()),
                      ('clf',clf1)])

Decision trees do not need standardization (their threshold splits are invariant to feature scaling), so clf2 is used without a pipeline.

pipe3=Pipeline(steps=[('sc',StandardScaler()),
                      ('clf',clf3)])
clf_labels=['Logistic regression','Decision tree','KNN']
print('10-fold cross validation:\n')
for clf,label in zip([pipe1,clf2,pipe3],clf_labels):
    scores=cross_val_score(estimator=clf,
                          X=X_train,
                          y=y_train,
                          cv=10,
                          scoring='roc_auc')
    print("ROC_AUC:%0.2f (+/- %0.2f) [%s]"
          %(scores.mean(),scores.std(),label))
10-fold cross validation:

ROC_AUC:0.92 (+/- 0.15) [Logistic regression]
ROC_AUC:0.87 (+/- 0.18) [Decision tree]
ROC_AUC:0.85 (+/- 0.13) [KNN]

Using the MajorityVoteClassifier

mv_clf=MajorityVoteClassifier(classifiers=[pipe1,clf2,pipe3])
clf_labels+=['Majority voting']
all_clf=[pipe1,clf2,pipe3,mv_clf]
for clf,label in zip(all_clf,clf_labels):
    scores=cross_val_score(estimator=clf,
                          X=X_train,
                          y=y_train,
                          cv=10,
                          scoring='roc_auc')
    print("ROC_AUC: %0.2f (+/-%0.2f) [%s]"
          %(scores.mean(),scores.std(),label))
ROC_AUC: 0.92 (+/-0.15) [Logistic regression]
ROC_AUC: 0.87 (+/-0.18) [Decision tree]
ROC_AUC: 0.85 (+/-0.13) [KNN]
ROC_AUC: 0.98 (+/-0.05) [Majority voting]
Evaluating and tuning the ensemble classifier
#Evaluate the ensemble classifier on the test set
from sklearn.metrics import roc_curve
from sklearn.metrics import auc

colors=['black','orange','blue','green']
linestyles=[':','--','-.','-']
for clf,label,clr,ls in zip(all_clf,clf_labels,colors,linestyles):
    #assume the positive class is 1
    y_pred=clf.fit(X_train,y_train).predict_proba(X_test)[:,1]
    fpr,tpr,thresholds=roc_curve(y_true=y_test,y_score=y_pred)
    roc_auc=auc(x=fpr,y=tpr)
    plt.plot(fpr,tpr,
             color=clr,
            linestyle=ls,
            label="%s(auc=%0.2f)"%(label,roc_auc))
    
plt.legend(loc='lower right')
plt.grid(alpha=0.7)
plt.xlabel('FPR')
plt.ylabel('TPR')
plt.show()

#Plot the decision regions of each classifier
sc=StandardScaler()
X_train_std=sc.fit_transform(X_train)#standardize so the decision tree and the other models share the same scale in the plot
from itertools import product
x_min=X_train_std[:,0].min()-1
x_max=X_train_std[:,0].max()+1
y_min=X_train_std[:,1].min()-1
y_max=X_train_std[:,1].max()+1

The c in np.c_ stands for column: it stacks arrays column-wise, i.e. joins them side by side, which requires equal numbers of rows.
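A quick illustration:

a=np.array([1,2,3])
b=np.array([4,5,6])
np.c_[a,b]
array([[1, 4],
       [2, 5],
       [3, 6]])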

xx,yy=np.meshgrid(np.arange(x_min,x_max,0.1),
                 np.arange(y_min,y_max,0.1))#build the coordinate grid: xx holds the x coordinates, yy the y coordinates

f,axarr=plt.subplots(nrows=2,ncols=2,
                    sharex='col',
                    sharey='row',
                    figsize=(7,5))
#sharex='col': only the bottom row shows x tick labels; sharey='row': only the left column shows y tick labels
#'row': shared within each row
#'col': shared within each column

for idx,clf,tt in zip(product([0,1],[0,1]),
                     all_clf,clf_labels):
    #product(list1,list2) pairs every element of list1 with every element of list2 as tuples
    
    clf.fit(X_train_std,y_train)
    z=clf.predict(np.c_[xx.ravel(),yy.ravel()])#predict y=0 or y=1 for every point of the grid covering the plot
    z=z.reshape(xx.shape)
    axarr[idx[0],idx[1]].contourf(xx,yy,z,alpha=0.3)
    axarr[idx[0],idx[1]].scatter(X_train_std[y_train==0,0],
                                X_train_std[y_train==0,1],
                                color='blue',
                                marker='^',
                                s=50)
    axarr[idx[0],idx[1]].scatter(X_train_std[y_train==1,0],
                                X_train_std[y_train==1,1],
                                color='green',
                                marker='o',
                                s=50)
    axarr[idx[0],idx[1]].set_title(tt)
    
plt.text(-3.5,-4.5,s='sepal width',
        ha='center',va='center',
        fontsize=12)
plt.text(-13.5,4.5,
        s='petal length',
        ha='center',
        va='center',
        fontsize=12,
        rotation=90)
plt.show()
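For reference, the subplot indices above come from itertools.product:

list(product([0,1],[0,1]))
[(0, 0), (0, 1), (1, 0), (1, 1)]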

#Hyperparameter tuning
mv_clf.get_params()
{'pipeline-1': Pipeline(steps=[('sc', StandardScaler()),
                 ('clf', LogisticRegression(C=0.001, random_state=1))]),
 'decisiontreeclassifier': DecisionTreeClassifier(criterion='entropy', max_depth=1, random_state=0),
 'pipeline-2': Pipeline(steps=[('sc', StandardScaler()),
                 ('clf', KNeighborsClassifier(n_neighbors=1))]),
 'pipeline-1__memory': None,
 'pipeline-1__steps': [('sc', StandardScaler()),
  ('clf', LogisticRegression(C=0.001, random_state=1))],
 'pipeline-1__verbose': False,
 'pipeline-1__sc': StandardScaler(),
 'pipeline-1__clf': LogisticRegression(C=0.001, random_state=1),
 'pipeline-1__sc__copy': True,
 'pipeline-1__sc__with_mean': True,
 'pipeline-1__sc__with_std': True,
 'pipeline-1__clf__C': 0.001,
 'pipeline-1__clf__class_weight': None,
 'pipeline-1__clf__dual': False,
 'pipeline-1__clf__fit_intercept': True,
 'pipeline-1__clf__intercept_scaling': 1,
 'pipeline-1__clf__l1_ratio': None,
 'pipeline-1__clf__max_iter': 100,
 'pipeline-1__clf__multi_class': 'auto',
 'pipeline-1__clf__n_jobs': None,
 'pipeline-1__clf__penalty': 'l2',
 'pipeline-1__clf__random_state': 1,
 'pipeline-1__clf__solver': 'lbfgs',
 'pipeline-1__clf__tol': 0.0001,
 'pipeline-1__clf__verbose': 0,
 'pipeline-1__clf__warm_start': False,
 'decisiontreeclassifier__ccp_alpha': 0.0,
 'decisiontreeclassifier__class_weight': None,
 'decisiontreeclassifier__criterion': 'entropy',
 'decisiontreeclassifier__max_depth': 1,
 'decisiontreeclassifier__max_features': None,
 'decisiontreeclassifier__max_leaf_nodes': None,
 'decisiontreeclassifier__min_impurity_decrease': 0.0,
 'decisiontreeclassifier__min_samples_leaf': 1,
 'decisiontreeclassifier__min_samples_split': 2,
 'decisiontreeclassifier__min_weight_fraction_leaf': 0.0,
 'decisiontreeclassifier__random_state': 0,
 'decisiontreeclassifier__splitter': 'best',
 'pipeline-2__memory': None,
 'pipeline-2__steps': [('sc', StandardScaler()),
  ('clf', KNeighborsClassifier(n_neighbors=1))],
 'pipeline-2__verbose': False,
 'pipeline-2__sc': StandardScaler(),
 'pipeline-2__clf': KNeighborsClassifier(n_neighbors=1),
 'pipeline-2__sc__copy': True,
 'pipeline-2__sc__with_mean': True,
 'pipeline-2__sc__with_std': True,
 'pipeline-2__clf__algorithm': 'auto',
 'pipeline-2__clf__leaf_size': 30,
 'pipeline-2__clf__metric': 'minkowski',
 'pipeline-2__clf__metric_params': None,
 'pipeline-2__clf__n_jobs': None,
 'pipeline-2__clf__n_neighbors': 1,
 'pipeline-2__clf__p': 2,
 'pipeline-2__clf__weights': 'uniform'}
from sklearn.model_selection import GridSearchCV
#Tune the logistic regression and decision tree parameters
params={'decisiontreeclassifier__max_depth':[1,2],
        'pipeline-1__clf__C':[0.001,0.1,100.0]}
grid=GridSearchCV(estimator=mv_clf,
                 param_grid=params,
                 cv=10,
                 scoring='roc_auc')
grid.fit(X_train,y_train)
GridSearchCV(cv=10,
             estimator=MajorityVoteClassifier(classifiers=[Pipeline(steps=[('sc',
                                                                            StandardScaler()),
                                                                           ('clf',
                                                                            LogisticRegression(C=0.001,
                                                                                               random_state=1))]),
                                                           DecisionTreeClassifier(criterion='entropy',
                                                                                  max_depth=1,
                                                                                  random_state=0),
                                                           Pipeline(steps=[('sc',
                                                                            StandardScaler()),
                                                                           ('clf',
                                                                            KNeighborsClassifier(n_neighbors=1))])]),
             param_grid={'decisiontreeclassifier__max_depth': [1, 2],
                         'pipeline-1__clf__C': [0.001, 0.1, 100.0]},
             scoring='roc_auc')
grid.cv_results_
{'mean_fit_time': array([0.0076185 , 0.00660717, 0.00634389, 0.00440733, 0.00522645,
        0.00517545]),
 'std_fit_time': array([0.00748017, 0.00485841, 0.00090428, 0.00044422, 0.00073637,
        0.00052229]),
 'mean_score_time': array([0.00169656, 0.00185349, 0.00189853, 0.00183671, 0.00169687,
        0.00145319]),
 'std_score_time': array([0.000457  , 0.00053638, 0.00069667, 0.00033673, 0.00063198,
        0.00047215]),
 'param_decisiontreeclassifier__max_depth': masked_array(data=[1, 1, 1, 2, 2, 2],
              mask=[False, False, False, False, False, False],
        fill_value='?',
             dtype=object),
 'param_pipeline-1__clf__C': masked_array(data=[0.001, 0.1, 100.0, 0.001, 0.1, 100.0],
              mask=[False, False, False, False, False, False],
        fill_value='?',
             dtype=object),
 'params': [{'decisiontreeclassifier__max_depth': 1,
   'pipeline-1__clf__C': 0.001},
  {'decisiontreeclassifier__max_depth': 1, 'pipeline-1__clf__C': 0.1},
  {'decisiontreeclassifier__max_depth': 1, 'pipeline-1__clf__C': 100.0},
  {'decisiontreeclassifier__max_depth': 2, 'pipeline-1__clf__C': 0.001},
  {'decisiontreeclassifier__max_depth': 2, 'pipeline-1__clf__C': 0.1},
  {'decisiontreeclassifier__max_depth': 2, 'pipeline-1__clf__C': 100.0}],
 'split0_test_score': array([1., 1., 1., 1., 1., 1.]),
 'split1_test_score': array([1., 1., 1., 1., 1., 1.]),
 'split2_test_score': array([1., 1., 1., 1., 1., 1.]),
 'split3_test_score': array([1., 1., 1., 1., 1., 1.]),
 'split4_test_score': array([0.83333333, 0.83333333, 0.66666667, 0.83333333, 0.83333333,
        0.66666667]),
 'split5_test_score': array([1., 1., 1., 1., 1., 1.]),
 'split6_test_score': array([1., 1., 1., 1., 1., 1.]),
 'split7_test_score': array([1., 1., 1., 1., 1., 1.]),
 'split8_test_score': array([1., 1., 1., 1., 1., 1.]),
 'split9_test_score': array([1., 1., 1., 1., 1., 1.]),
 'mean_test_score': array([0.98333333, 0.98333333, 0.96666667, 0.98333333, 0.98333333,
        0.96666667]),
 'std_test_score': array([0.05, 0.05, 0.1 , 0.05, 0.05, 0.1 ]),
 'rank_test_score': array([1, 1, 5, 1, 1, 5])}
#Show the cross-validated score for each hyperparameter combination
for i,_ in enumerate(grid.cv_results_['mean_test_score']):
    print("%0.3f +/- %0.2f %r"
          %(grid.cv_results_['mean_test_score'][i],
                             grid.cv_results_['std_test_score'][i],
                                              grid.cv_results_['params'][i]))
0.983 +/- 0.05 {'decisiontreeclassifier__max_depth': 1, 'pipeline-1__clf__C': 0.001}
0.983 +/- 0.05 {'decisiontreeclassifier__max_depth': 1, 'pipeline-1__clf__C': 0.1}
0.967 +/- 0.10 {'decisiontreeclassifier__max_depth': 1, 'pipeline-1__clf__C': 100.0}
0.983 +/- 0.05 {'decisiontreeclassifier__max_depth': 2, 'pipeline-1__clf__C': 0.001}
0.983 +/- 0.05 {'decisiontreeclassifier__max_depth': 2, 'pipeline-1__clf__C': 0.1}
0.967 +/- 0.10 {'decisiontreeclassifier__max_depth': 2, 'pipeline-1__clf__C': 100.0}

For background on the `for i, _ in enumerate(...)` idiom, see:

https://blog.csdn.net/hello_world_blog/article/details/89314816

print("best parameters: %s"%(grid.best_params_))
best parameters: {'decisiontreeclassifier_max_depth': 1, 'pipeline-1_clf__C': 0.001}
print('accuracy: %0.2f'%(grid.best_score_))
accuracy: 0.98
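Since GridSearchCV refits the best configuration on the whole training set by default (refit=True), the tuned ensemble is directly available; `best_clf` below is just an illustrative name:

best_clf=grid.best_estimator_#already refit on the full training set because refit=True (the default)
#an equivalent manual route: mv_clf.set_params(**grid.best_params_).fit(X_train,y_train)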
7.3 Bagging: building an ensemble of classifiers from bootstrap samples
df_wine=pd.read_csv('wine-Copy1.data',header=None,names=['label',
                                                        'alcohol',
                                                        'malic acid',
                                                        'ash',
                                                        'alcalinity of ash',
                                                        'magnesium',
                                                        'total phenols',
                                                        'flavanoids',
                                                        'nonflavanoid phenols',
                                                        'proanthocyanins',
                                                        'color intensity',
                                                        'hue',
                                                        'OD280',
                                                        'proline'])
df_wine.head()
   label  alcohol  malic acid   ash  alcalinity of ash  magnesium  total phenols  flavanoids  nonflavanoid phenols  proanthocyanins  color intensity   hue  OD280  proline
0      1    14.23        1.71  2.43               15.6        127           2.80        3.06                  0.28             2.29             5.64  1.04   3.92     1065
1      1    13.20        1.78  2.14               11.2        100           2.65        2.76                  0.26             1.28             4.38  1.05   3.40     1050
2      1    13.16        2.36  2.67               18.6        101           2.80        3.24                  0.30             2.81             5.68  1.03   3.17     1185
3      1    14.37        1.95  2.50               16.8        113           3.85        3.49                  0.24             2.18             7.80  0.86   3.45     1480
4      1    13.24        2.59  2.87               21.0        118           2.80        2.69                  0.39             1.82             4.32  1.04   2.93      735
#Keep only classes 2 and 3
df_wine=df_wine[df_wine['label']!=1]
y=df_wine['label'].values
X=df_wine[['alcohol','OD280']].values
#Encode the class labels
from sklearn.preprocessing import LabelEncoder
le=LabelEncoder()
y=le.fit_transform(y)
y
array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1], dtype=int64)
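LabelEncoder assigns consecutive integers to the sorted original labels, so class 2 maps to 0 and class 3 maps to 1:

le.classes_
array([2, 3], dtype=int64)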
#Split into training and test sets
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(X,y,
                                               test_size=0.2,
                                               stratify=y,
                                              random_state=1)
#Bagging via sklearn's BaggingClassifier
from sklearn.ensemble import BaggingClassifier
tree=DecisionTreeClassifier(criterion='entropy',
                           random_state=1,
                           max_depth=None)
#Build an ensemble of 500 decision trees
bag=BaggingClassifier(base_estimator=tree,
                     n_estimators=500,
                     max_samples=1.0,#as a float: the fraction of training samples drawn for each tree
                     max_features=1.0,
                     bootstrap=True,
                     bootstrap_features=False,
                     n_jobs=1,
                     random_state=1)
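Conceptually, each of the 500 estimators is trained on a bootstrap sample of the training set. A minimal sketch of one such round (illustrative only; rng, boot_idx and one_tree are made-up names, not sklearn internals):

rng=np.random.RandomState(1)
boot_idx=rng.choice(len(X_train),size=len(X_train),replace=True)#draw n samples with replacement
one_tree=DecisionTreeClassifier(criterion='entropy',random_state=1)
one_tree.fit(X_train[boot_idx],y_train[boot_idx])#each tree sees its own resampled view of the data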
#Accuracy of a single decision tree on the training and test sets
from sklearn.metrics import accuracy_score
tree.fit(X_train,y_train)
y_train_pred=tree.predict(X_train)
y_test_pred=tree.predict(X_test)
tree_train=accuracy_score(y_train,y_train_pred)#accuracy on the training set
tree_test=accuracy_score(y_test,y_test_pred)#accuracy on the test set
print("decision tree train/test accuracy %0.3f/%0.3f"%(tree_train,tree_test))#perfect train vs. weaker test: overfitting
decision tree train/test accuracy 1.000/0.833
#Accuracy of the bagging ensemble on the training and test sets
bag=bag.fit(X_train,y_train)
y_train_pred=bag.predict(X_train)
y_test_pred=bag.predict(X_test)
bag_train=accuracy_score(y_train,y_train_pred)
bag_test=accuracy_score(y_test,y_test_pred)
print('bagging train/test accuracy: %0.3f/%0.3f'%(bag_train,bag_test))
bagging train/test accuracy: 1.000/0.917
#Visualize the decision regions on the training set
x_min=X_train[:,0].min()-1
x_max=X_train[:,0].max()+1
y_min=X_train[:,1].min()-2
y_max=X_train[:,1].max()+2



xx,yy=np.meshgrid(np.arange(x_min,x_max,0.1),
                 np.arange(y_min,y_max,0.1))#build the coordinate grid: xx holds the x coordinates, yy the y coordinates

f,axarr=plt.subplots(nrows=1,ncols=2,
                    sharex='col',
                    sharey='row',
                    figsize=(8,3))
#sharex='col': only the bottom row shows x tick labels; sharey='row': only the left column shows y tick labels
#'row': shared within each row
#'col': shared within each column

for idx,clf,tt in zip([0,1],
                     [tree,bag],['decision tree','bagging']):
    #zip pairs each subplot index with a classifier and its title

    clf.fit(X_train,y_train)
    z=clf.predict(np.c_[xx.ravel(),yy.ravel()])#predict y=0 or y=1 for every point of the grid covering the plot
    z=z.reshape(xx.shape)
    axarr[idx].contourf(xx,yy,z,alpha=0.3)
    axarr[idx].scatter(X_train[y_train==0,0],
                                X_train[y_train==0,1],
                                color='blue',
                                marker='^',
                                s=50)
    axarr[idx].scatter(X_train[y_train==1,0],
                                X_train[y_train==1,1],
                                color='green',
                                marker='o',
                                s=50)
    axarr[idx].set_title(tt)

axarr[0].set_ylabel('OD280',fontsize=12)#column 1 of X (OD280) is on the y-axis
plt.text(9.5,-2,s='Alcohol',ha='center',va='center',fontsize=12)#column 0 of X (alcohol) is on the x-axis

plt.show()

7.4 Boosting with AdaBoost
#Data: reuse the Wine train/test split from above

from sklearn.ensemble import AdaBoostClassifier
tree=DecisionTreeClassifier(criterion='entropy',
                           random_state=1,
                           max_depth=1)
ada=AdaBoostClassifier(base_estimator=tree,
                      n_estimators=500,
                      learning_rate=0.1,
                      random_state=1)#ensemble of 500 depth-1 trees (decision stumps)
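For reference, a common textbook formulation of binary AdaBoost (labels coded as -1/+1) computes a stage weight from the weighted error \varepsilon_j of round j and then reweights the samples (note that sklearn's AdaBoostClassifier implements the SAMME family, which differs in constants):

\alpha_j = \frac{1}{2}\ln\frac{1-\varepsilon_j}{\varepsilon_j},\qquad
w_i \leftarrow \frac{w_i\,\exp\!\big(-\alpha_j\, y_i\, \hat{y}_i^{(j)}\big)}{Z_j}

where \hat{y}_i^{(j)} is round j's prediction for sample i and Z_j normalizes the weights so they sum to one. Misclassified samples gain weight, so the next stump focuses on them.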

tree.fit(X_train,y_train)
y_train_pred=tree.predict(X_train)
y_test_pred=tree.predict(X_test)

tree_train=accuracy_score(y_train,y_train_pred)
tree_test=accuracy_score(y_test,y_test_pred)

print("Decison Tree train/test accuracy: %0.3f/%0.3f"%(tree_train,tree_test))
Decison Tree train/test accuracy: 0.916/0.875
#Use a learning curve to diagnose under- vs. overfitting
import matplotlib.pyplot as plt
from  sklearn.model_selection import learning_curve
train_sizes,train_scores,test_scores=learning_curve(estimator=tree,
              X=X_train,
              y=y_train,
              train_sizes=np.linspace(0.1,1,10),
              cv=10,
              n_jobs=1)
train_mean=np.mean(train_scores,axis=1)
test_mean=np.mean(test_scores,axis=1)
plt.plot(train_sizes,train_mean,color='g',marker='o',
         label='training accuracy')
plt.plot(train_sizes,test_mean,color='r',marker='^',
         label='validation accuracy')

plt.legend(loc='best')
plt.show()

Underfitting check: the training and validation curves are not smooth yet and still oscillate, meaning the model has not converged; if it had converged, the tail of each curve would flatten into a nearly straight line.

As a rough rule of thumb, if training accuracy is above 0.9 while validation/test accuracy is far below 0.9, the model is overfitting.

If training accuracy is itself far below 0.9, and so is validation/test accuracy, the model is underfitting.

#Fit and evaluate the AdaBoost ensemble
ada.fit(X_train,y_train)

y_train_pred=ada.predict(X_train)
y_test_pred=ada.predict(X_test)

ada_train=accuracy_score(y_train,y_train_pred)
ada_test=accuracy_score(y_test,y_test_pred)

print("Adaboost train/test accuracy:%0.3f/%0.3f"%(ada_train,ada_test))
Adaboost train/test accuracy:1.000/0.917
train_sizes1,train_scores1,test_scores1=learning_curve(estimator=ada,
              X=X_train,
              y=y_train,
              train_sizes=np.linspace(0.1,1,10),
              cv=10,
              n_jobs=1)
train_mean1=np.mean(train_scores1,axis=1)
test_mean1=np.mean(test_scores1,axis=1)
plt.plot(train_sizes1,train_mean1,color='k',marker='o',
        label='training accuracy')
plt.plot(train_sizes1,test_mean1,color='b',marker='^',
        label='validation accuracy')

plt.title('Adaboost learning curve')
plt.xlabel('numbers of samples')
plt.ylabel('accuracy')
plt.legend(loc='best')
plt.show()


This looks slightly underfit.

#Visualize the decision regions on the training set
x_min=X_train[:,0].min()-1
x_max=X_train[:,0].max()+1
y_min=X_train[:,1].min()-2
y_max=X_train[:,1].max()+2



xx,yy=np.meshgrid(np.arange(x_min,x_max,0.1),
                 np.arange(y_min,y_max,0.1))#build the coordinate grid: xx holds the x coordinates, yy the y coordinates

f,axarr=plt.subplots(nrows=1,ncols=2,
                    sharex='col',
                    sharey='row',
                    figsize=(8,3))
#sharex='col': only the bottom row shows x tick labels; sharey='row': only the left column shows y tick labels
#'row': shared within each row
#'col': shared within each column

for idx,clf,tt in zip([0,1],
                     [tree,ada],['decision tree','Adaboost']):
  
    
    clf.fit(X_train,y_train)
    z=clf.predict(np.c_[xx.ravel(),yy.ravel()])#predict y=0 or y=1 for every point of the grid covering the plot
    z=z.reshape(xx.shape)
    axarr[idx].contourf(xx,yy,z,alpha=0.3)
    axarr[idx].scatter(X_train[y_train==0,0],
                                X_train[y_train==0,1],
                                color='blue',
                                marker='^',
                                s=50)
    axarr[idx].scatter(X_train[y_train==1,0],
                                X_train[y_train==1,1],
                                color='green',
                                marker='o',
                                s=50)
    axarr[idx].set_title(tt)

axarr[0].set_ylabel('OD280',fontsize=12)#column 1 of X (OD280) is on the y-axis
plt.text(9.5,-2,s='Alcohol',ha='center',va='center',fontsize=12)#column 0 of X (alcohol) is on the x-axis

plt.show()

