Hands-On Machine Learning with Python's Sklearn Library


There are plenty of theory posts, but hands-on ones are hard to track down, so while self-studying I collected some commonly used machine learning algorithms and utilities (grid search, k-fold cross-validation, imputation, visualization, etc.). For reference only.

Preprocessing: https://github.com/xushige/Machine-Learning--Sklearn/tree/main/Data_Preprocess

from sklearn.impute import SimpleImputer
import pandas as pd

'''This file is not meant to be run as-is; it only demonstrates the various preprocessing methods'''

data = 0
# Missing-value imputation
# strategy: computed per column -- mean, median, most_frequent (mode), or constant. default: mean
# fill_value: used with strategy='constant' for fixed-value filling. default: None
si = SimpleImputer()
data = si.fit_transform(data) #data.shape==(n, m): n samples, m features
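# Self-contained sketch (the toy array is my own, not from the repo): strategy='constant'
# together with fill_value replaces every NaN with a fixed value.
import numpy as np
toy = np.array([[1.0, np.nan], [np.nan, 3.0]])
si_const = SimpleImputer(strategy='constant', fill_value=0)
print(si_const.fit_transform(toy)) # NaNs become 0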

# Encoding categorical text data
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
le = le.fit(data) # data is a 1-D array
label = le.transform(data)
'''Or simply: label = le.fit_transform(data)'''
data.iloc[:, -1] = LabelEncoder().fit_transform(data.iloc[:, -1]) # in practice, encode the last column of the DataFrame

# Converting categorical features to categorical integers
from sklearn.preprocessing import OrdinalEncoder
data.iloc[:, :] = OrdinalEncoder().fit_transform(data.iloc[:, :]) # unlike LabelEncoder, this can encode several columns of categorical data at once

# OrdinalEncoder maps categories straight to 0, 1, 2, which imposes a numeric order the categories don't have,
# so dummy (one-hot) variables describe parallel, unrelated categories more faithfully
from sklearn.preprocessing import OneHotEncoder
result = OneHotEncoder(categories='auto').fit_transform(data.iloc[:, :]).toarray() # toarray() converts the sparse output for pandas
# result is the one-hot encoding of the selected columns (here one column expands into five), so drop the original columns and concat the one-hot dummies
data.drop(['column1', 'column2'], axis=1, inplace=True)
data = pd.concat([data, pd.DataFrame(result)], axis=1) # column-wise concatenation
data.columns = ['new_column0', 'new_column1', 'new_column2']
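# Self-contained sketch (the toy frame is my own): sklearn >= 1.0 can name the dummy
# columns for you via get_feature_names_out instead of renaming them by hand.
toy_df = pd.DataFrame({'color': ['red', 'blue', 'red'], 'size': ['S', 'L', 'S']})
ohe = OneHotEncoder(categories='auto')
onehot = ohe.fit_transform(toy_df).toarray()
print(ohe.get_feature_names_out()) # ['color_blue' 'color_red' 'size_L' 'size_S']
print(onehot)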

# Binarizing a continuous feature against a threshold
from sklearn.preprocessing import Binarizer
data.iloc[:, 0:1] = Binarizer(threshold=30).fit_transform(data.iloc[:, 0:1]) #input must be 2-D, not a 1-D array

 

Decision tree: https://github.com/xushige/Machine-Learning--Sklearn/tree/main/Decision%20Tree

import pandas as pd
import numpy as np
from sklearn import tree
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split

data = pd.read_csv('titanic_data/train.csv')
print(data.info(), data.head(10))

#Drop features with no bearing on the outcome
data.drop(['Cabin', 'Name', 'Ticket'], axis=1, inplace=True)
print(data.info())

#Fill the many missing Age values with the mean
data['Age'].fillna(data['Age'].mean(), inplace=True)
print(data.info())

#Row alignment: like the shortest plank of the barrel, drop every row that still contains NaN
data.dropna(axis=0, inplace=True)
print(data.info())

#Convert object-typed columns to integer labels; unique() yields the distinct object labels as an array
transform = [data['Embarked'], data['Sex']]
for eachitem in transform:
    eachitem.replace(eachitem.unique(), [i for i in range(len(eachitem.unique()))], inplace=True)
print(data.info())

#Separate the y label from the x data
y = data['Survived']
data.drop('Survived', axis=1, inplace=True)
print(data.head(10), '\n', y)

#With cross-validation the explicit train/test split is not needed (the grid search below fits on the full data)
X_train, X_test, Y_train, Y_test = train_test_split(data, y, test_size=0.3)

#Grid search: enumerate parameter combinations, score each, and feed the best into the instantiated model
parameters = {
    'criterion': ('gini', 'entropy'),
    'max_depth': [*range(1, 5)],
    'min_samples_leaf': [*range(1, 10, 2)],
    'min_impurity_decrease': np.arange(0, 0.5, 0.1)
}
clf = tree.DecisionTreeClassifier(criterion='entropy', max_depth=3, min_samples_leaf=3, min_impurity_decrease=0)
Grid_Search = GridSearchCV(estimator=clf, param_grid=parameters, cv=10).fit(data, y)
print(Grid_Search.best_params_, Grid_Search.best_score_)
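# Optional follow-up (my addition, standard sklearn API): GridSearchCV refits the best
# model on the full data by default, and export_text renders the fitted tree as text.
from sklearn.tree import export_text
best_clf = Grid_Search.best_estimator_
print(export_text(best_clf, feature_names=list(data.columns)))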

Dimensionality reduction: https://github.com/xushige/Machine-Learning--Sklearn/tree/main/Decomposition

from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
from sklearn.datasets import load_iris

'''Dimensionality reduction always loses some information'''

iris = load_iris()
x = iris['data']
y = iris['target']
print(x.shape, y.shape) # 4-dimensional feature data

'''n_components: the target dimensionality'''
pca = PCA(n_components=2)
pca = pca.fit(x)
x_pca = pca.transform(x)
print(x_pca.shape)
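# Quick check of the information-loss note above (my addition): map the 2 components
# back to 4-D with inverse_transform and measure the reconstruction error.
x_back = pca.inverse_transform(x_pca)
print('reconstruction MSE: %.6f' % ((x - x_back) ** 2).mean())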

'''Visualize the data distribution after PCA'''
for classes in range(len(set(y))):
    plt.scatter(x_pca[y==classes, 0], x_pca[y==classes, 1], label=iris['target_names'][classes])
plt.legend()
plt.title('PCA of IRIS dataset')
plt.savefig('PCA_visualization')
plt.clf()

'''explained_variance_: the variance explained per component; the larger the variance, the more information it carries and the more useful it is for classification'''
explained_variance = pca.explained_variance_
explained_variance_ratio = pca.explained_variance_ratio_
print(explained_variance, explained_variance_ratio)
print('Retained information ratio: %f' % (explained_variance_ratio.sum()))

'''Plot the n_components--explained_variance_ratio curve and pick n with the elbow method'''
explained_variance_ratios = []
for n in range(1, 5):
    pca = PCA(n)
    pca = pca.fit(x)
    explained_variance_ratios.append(pca.explained_variance_ratio_.sum())
plt.plot([*range(1, 5)], explained_variance_ratios)
plt.xticks([*range(1, 5)])
plt.xlabel('n-components')
plt.ylabel('explained_variance_ratio')
plt.grid(axis='y')
plt.savefig('n-components selection')
plt.clf()

KMeans: https://github.com/xushige/Machine-Learning--Sklearn/tree/main/K_means

from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_samples, silhouette_score # silhouette metrics for assessing clustering quality
from sklearn.datasets import make_blobs
import matplotlib.pyplot as plt

## The clustering idea: differences within a cluster should be small, differences between clusters large

'''
Create a dataset
n_samples: number of samples
n_features: feature dimensionality
centers: number of clusters
'''
x, y =  make_blobs(n_samples=500, n_features=2, centers=4, random_state=1)

'''Visualize the raw data distribution'''
fig, ax1 = plt.subplots(1) # a single subplot on the canvas
ax1.scatter(x[:, 0], x[:, 1], marker='o')
plt.savefig('data_distribution')
plt.clf()

'''Visualize the clustering result'''
n_clusters = 4
kmeans = KMeans(n_clusters=n_clusters)
kmeans.fit(x)
y_pred = kmeans.labels_ # predicted labels for the training data
centers = kmeans.cluster_centers_ # cluster-center coordinates
for each in range(n_clusters):
    plt.scatter(x[y_pred==each, 0], x[y_pred==each, 1], marker='o', label='class:%d'%each)
plt.legend()
plt.savefig('kmeans_vis')
plt.clf() 
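'''inertia_: the within-cluster sum of squared distances that KMeans minimizes, i.e. the
"small differences within a cluster" goal stated above (sketch, my addition)'''
print('inertia:【%.4f】'%kmeans.inertia_)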

'''Silhouette coefficient'''
score = silhouette_score(x, y_pred)
print('Silhouette score:【%.4f】'%score)
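'''silhouette_samples (imported above but otherwise unused here) gives a per-sample
silhouette value whose mean equals silhouette_score (sketch, my addition)'''
sample_scores = silhouette_samples(x, y_pred)
print('mean of per-sample silhouette values:【%.4f】'%sample_scores.mean())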

'''Choose the k value based on the silhouette coefficient'''
res = []
for k in range(2, 10):
    model = KMeans(k)
    model.fit(x)
    y_pred = model.labels_
    res.append(silhouette_score(x, y_pred))
plt.plot([*range(2, 10)], res, marker='o')
plt.xlabel('K value selection')
plt.ylabel('silhouette_score')
plt.savefig('K_value_selection')
plt.clf()

Linear regression: https://github.com/xushige/Machine-Learning--Sklearn/tree/main/Linear%20regression

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedShuffleSplit, ShuffleSplit
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import load_wine
from sklearn.metrics import mean_squared_error, r2_score, explained_variance_score
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

'''Data preprocessing (load, standardize, split)'''
data = load_wine()
x, y = data['data'], data['target']
print('data shape: %s     label shape: %s' % (x.shape, y.shape))
feature_names = data['feature_names']
x = pd.DataFrame(x, columns=feature_names)
x = StandardScaler().fit_transform(x)
trainx, testx, trainy, testy = train_test_split(x, y, test_size=0.3, shuffle=True)

'''Building the regression model'''
# fit_intercept: whether to fit an intercept. default: True
# normalize: column-wise normalization applied before fitting (deprecated in newer sklearn; standardize the data yourself, as above). default: False
# copy_X: compute on a copy of X. default: True
# n_jobs: number of jobs for the computation; -1 uses all CPUs. default: None
linear = LinearRegression(fit_intercept=True).fit(trainx, trainy)
score = linear.score(testx, testy)
coef = sorted(list(zip(feature_names, linear.coef_)), key=lambda x:x[1], reverse=True)
intercept = linear.intercept_
print('====================================================================================================')
print("When split-style is train/test==7:3, the score of linearregression is: [%.4f]\nThe intercept is: [%.4f]\nImportance condition:\n%s"%(score, intercept, coef))
print('====================================================================================================')
cross_score = cross_val_score(LinearRegression(), x, y, cv=ShuffleSplit(5, test_size=0.3))
print("When split-style is 5-fold, the score of linearregression is: [%.4f]"%(cross_score.mean()))
print('====================================================================================================')

'''Regression metrics: is the prediction correct (MSE, precision-like), and does the model capture enough information (R2, recall-like)'''
# Is the prediction correct: MSE
# Mean Squared Error (MSE). NB: sklearn's 'neg_mean_squared_error' scorer is always negative, because it represents a loss
print("MSE SCORE:")
mse_score = mean_squared_error(linear.predict(testx), testy)
print('When split-style is train/test==7:3, MSE score is: %.4f'%(mse_score))
cross_mse_score = cross_val_score(LinearRegression(), x, y, cv=ShuffleSplit(5, test_size=0.3), scoring='neg_mean_squared_error')
print("When split-style is 5-fold, the MSE core of linearregression is: [%.4f]"%(cross_mse_score.mean())) 
print('====================================================================================================')

# Does the model capture enough information (recall-like)
# 1. R2 score: model.score defaults to R2 for regressors
print("R2 SCORE:")
print('When split-style is train/test==7:3, R2 score is: %.4f'%(score))
cross_r2_score = cross_val_score(LinearRegression(), x, y, cv=ShuffleSplit(5, test_size=0.3), scoring='r2')
print("When split-style is 5-fold, the R2 core of linearregression is: [%.4f]"%(cross_r2_score.mean())) 
print('====================================================================================================')

# Explained variance score
evs = explained_variance_score(testy, linear.predict(testx))
print('EVS of Linear Regression: %.4f'%(evs))

'''Visualization'''
plt.plot(testy, label='True')
plt.plot(linear.predict(testx), label='Pred')
plt.legend()
plt.ylim(-4, 6)
plt.title('Linear Regression Effect')
plt.savefig('linear_regression_effect')
plt.clf()

Logistic regression: https://github.com/xushige/Machine-Learning--Sklearn/tree/main/Logistic%20regression

from sklearn import linear_model
from sklearn.datasets import load_breast_cancer
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

## Logistic regression: particularly well suited to fitting linear relationships.
## y = sigmoid(w·x)

'''Dataset'''
data = load_breast_cancer()
x = data['data']
y = data['target']
print(x.shape, y.shape)

'''
Logistic regression model
penalty: ['l1', 'l2']  default: 'l2'
C: inverse of regularization strength, a positive float; the smaller C is, the stronger the regularization
'''
lr1 = linear_model.LogisticRegression(penalty='l1', solver='liblinear', C=0.5, max_iter=1000)
lr2 = linear_model.LogisticRegression(penalty='l2', solver='liblinear', C=0.5, max_iter=1000)
lr1.fit(x, y)
lr2.fit(x, y)
# Inspect the per-feature weights: L1 regularization drives some coefficients exactly to 0 (an implicit dimensionality reduction), while L2 only shrinks them toward 0
print(lr1.coef_.shape, lr2.coef_.shape)  
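# Two quick checks (my additions, using standard sklearn attributes):
# 1) L1 really zeroes some coefficients while L2 only shrinks them;
# 2) the model follows y = sigmoid(w·x + b): predict_proba's positive-class column
#    matches a manual sigmoid of the linear score.
print('L1 zero coefs: %d, L2 zero coefs: %d' % ((lr1.coef_ == 0).sum(), (lr2.coef_ == 0).sum()))
z = x @ lr2.coef_.ravel() + lr2.intercept_ # linear score w·x + b
manual_prob = 1 / (1 + np.exp(-z)) # sigmoid
print(np.allclose(manual_prob, lr2.predict_proba(x)[:, 1])) # expect True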

'''Split the dataset'''
xtrain, xtest, ytrain, ytest = train_test_split(x, y, test_size=0.3, shuffle=True)

'''Choosing the C value'''
l1, l2 = [], []
for i in range(5, 101, 5):
    c = i/100
    lr1 = linear_model.LogisticRegression(penalty='l1', solver='liblinear', C=c, max_iter=1000)
    lr1.fit(xtrain, ytrain)
    acc1 = (lr1.predict(xtest)==ytest).sum() / len(ytest)
    l1.append(acc1)
    
    lr2 = linear_model.LogisticRegression(penalty='l2', solver='liblinear', C=c, max_iter=1000)
    lr2.fit(xtrain, ytrain)
    acc2 = (lr2.predict(xtest)==ytest).sum() / len(ytest)
    l2.append(acc2)
plt.plot([i/100 for i in range(5, 101, 5)], l1, label='LR-L1')
plt.plot([i/100 for i in range(5, 101, 5)], l2, label='LR-L2')
plt.legend()
plt.xlabel('C value')
plt.title('c value selection')
plt.ylabel('Test, Accuracy')
plt.savefig('C_value_selection')
plt.clf()    

Naive Bayes: https://github.com/xushige/Machine-Learning--Sklearn/tree/main/Naive%20Bayes

from sklearn.naive_bayes import GaussianNB
from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix 
import matplotlib.pyplot as plt
'''Load, standardize, and split the dataset'''
data = load_digits()
x, y = data['data'], data['target']
x = StandardScaler().fit_transform(x)
print('data shape: %s     label shape: %s' % (x.shape, y.shape))
trainx, testx, trainy, testy = train_test_split(x, y, test_size=0.3, shuffle=True)

'''Building the naive Bayes model'''
# priors: 1-D array of length n_classes giving the class priors P(Y=yi); computed from the data if not specified. default: None
# var_smoothing: float added to the variances for numerical stability. default: 1e-9
nb = GaussianNB().fit(trainx, trainy)
# Mean accuracy (for classifiers, model.score returns accuracy, not R2)
score = nb.score(testx, testy)
print('predict accuracy is: %.4f'%(score))
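# Sketch of the priors parameter described above (my addition): a uniform prior over
# the 10 digit classes.
nb_uniform = GaussianNB(priors=[0.1]*10).fit(trainx, trainy)
print('uniform-prior accuracy: %.4f'%(nb_uniform.score(testx, testy)))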
# Probabilities
prob = nb.predict_proba(testx)
print('probability shape: %s -- per-sample posterior probability of each label (maximum a posteriori)'%str(prob.shape))

'''Confusion-matrix visualization'''
cm = confusion_matrix(testy, nb.predict(testx))
print('=======================================\nconfusion matrix:\n%s\n======================================='%(cm))
plt.imshow(cm, interpolation='nearest', cmap=plt.cm.Blues)
for i in range(cm.shape[0]):
    for j in range(cm.shape[1]):
        plt.text(i-0.2, j+0.2, str(cm[j][i]), color='white' if cm[j][i]>35 else 'black')
plt.title('confusion matrix')
plt.xticks([*range(prob.shape[1])])
plt.yticks([*range(prob.shape[1])])
plt.xlabel('Pred Label')
plt.ylabel('True Label')
plt.tight_layout()
plt.savefig('confusion matrix')

Random forest: https://github.com/xushige/Machine-Learning--Sklearn/tree/main/Random%20Forest

from sklearn.datasets import load_wine
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV

wine_data = load_wine()
Xtrain, Xtest, Ytrain, Ytest = train_test_split(wine_data.data, wine_data.target, test_size=0.3)
# Instantiation does not grow any trees yet; fit() builds them.
# The forest's random_state derives a distinct, fixed seed for each of its trees; the trees do not all share one random_state (see the check below)
# bootstrap: draw n samples with replacement for each tree
clf = DecisionTreeClassifier(criterion='entropy', max_depth=3, min_samples_leaf=3, random_state=0)
rfc = RandomForestClassifier(n_estimators=35, criterion='entropy', max_depth=4, min_samples_leaf=1, random_state=0)
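# Quick check of the per-tree seeding note above (my addition): each tree in a fitted
# forest carries its own distinct random_state derived from the forest's seed.
rfc_demo = RandomForestClassifier(n_estimators=5, random_state=0).fit(Xtrain, Ytrain)
print([est.random_state for est in rfc_demo.estimators_])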


def normal_compare():
    global clf, rfc
    clf = clf.fit(Xtrain, Ytrain)
    rfc = rfc.fit(Xtrain, Ytrain)

    score_tree = clf.score(Xtest, Ytest)
    score_forest = rfc.score(Xtest, Ytest)
    print('Tree normal test score: 【%f】, Forest normal test score:【%f】' % (score_tree, score_forest))

def cross_val():
    global clf, rfc
    cross_score_tree = cross_val_score(clf, wine_data.data, wine_data.target, cv=10).mean()
    cross_score_forest = cross_val_score(rfc, wine_data.data, wine_data.target, cv=10).mean()
    print('Tree cross_val_test score:【%f】, Forest cross_val_test score:【%f】' % (cross_score_tree, cross_score_forest))

def forest_grid_search():
    parameters = {
        'n_estimators': [*range(10, 50, 5)],
        'max_depth': [*range(1, 6)],
        'min_samples_leaf': [*range(1, 10, 2)],
        'criterion': ['entropy', 'gini']
    }
    GS = GridSearchCV(rfc, param_grid=parameters, cv=10).fit(wine_data.data, wine_data.target)
    print('Forest best params: %s\n Forest best score:【%f】' % (GS.best_params_, GS.best_score_))

def tree_grid_search():
    parameters = {
        'max_depth': [*range(1, 6)],
        'min_samples_leaf': [*range(1, 10, 2)],
        'criterion': ['entropy', 'gini']
    }
    GS = GridSearchCV(clf, param_grid=parameters, cv=10).fit(wine_data.data, wine_data.target)
    print('Tree best params: %s\nTree best score:【%f】' % (GS.best_params_, GS.best_score_))

normal_compare()
cross_val()
from sklearn.ensemble import RandomForestRegressor
from sklearn.datasets import load_boston
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.impute import SimpleImputer
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
boston = load_boston() # NB: load_boston was removed in scikit-learn 1.2; this requires an older sklearn version

x, _, y, _ = train_test_split(boston['data'], boston['target'], shuffle=True, test_size=1) # test_size=1 holds out a single sample; the split here effectively just shuffles x and y

'''Manufacture missing values'''
missing_rate = 0.5
n_miss_samples = int(x.shape[0]*x.shape[1]*missing_rate)
rng = np.random.RandomState(0) #fixed seed; plain np.random.randint would work too
missing_features = rng.randint(0, x.shape[1], n_miss_samples)
missing_samples = rng.randint(0, x.shape[0], n_miss_samples)
x[missing_samples, missing_features] = np.nan #set the chosen index positions to NaN
x = pd.DataFrame(x)
print(x)

def mean_fill(x):
    '''Mean-impute the missing values'''
    #Instantiate: fill np.nan with the column mean; with strategy='constant', the fill comes from fill_value
    imp_mean = SimpleImputer(missing_values=np.nan, strategy='mean', fill_value=0) 
    x_fill = imp_mean.fit_transform(x) #transform
    print(pd.DataFrame(x_fill).isna().sum()) #check that no NaN values remain
    rfr = RandomForestRegressor()
    score = cross_val_score(rfr, x_fill, y, cv=10)
    print(score)

def randomforest_fill(x):
    '''Random-forest imputation (in essence, predicting the missing values)'''
    x = x.copy()
    sortidx = np.argsort(x.isnull().sum(axis=0)).values #order features by missing count, fewest first
    for idx in sortidx:
        select = x.iloc[:, idx] #column to fill
        known = select.notnull().values
        if known.all():
            continue
        #the remaining columns are the predictors; zero-fill their own NaNs for now
        leave = SimpleImputer(missing_values=np.nan, strategy='constant', fill_value=0).fit_transform(x.drop(columns=x.columns[idx]))
        rfr = RandomForestRegressor().fit(leave[known], select[known])
        x.iloc[~known, idx] = rfr.predict(leave[~known])
    print(x.isna().sum()) #check that no NaN values remain
    score = cross_val_score(RandomForestRegressor(), x, y, cv=10)
    print(score)
randomforest_fill(x)

Support vector machine: https://github.com/xushige/Machine-Learning--Sklearn/tree/main/SVM

from sklearn.datasets import make_blobs, make_circles, load_breast_cancer
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedShuffleSplit
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.svm import SVC
import matplotlib.pyplot as plt
import numpy as np
import time

'''Visualizing binary SVM classification'''
def svm_vis(x, y, filename, kernel='linear'):
    print('x_size: %s, y_size:%s'%(x.shape, y.shape))
    plt.scatter(x[:, 0], x[:, 1], c=y, s=50, cmap='rainbow')
    ax = plt.gca()
    xlim = ax.get_xlim()
    ylim = ax.get_ylim()
    axisx = np.linspace(xlim[0], xlim[1], 30)
    axisy = np.linspace(ylim[0], ylim[1], 30)
    axisx, axisy = np.meshgrid(axisx, axisy)
    xy = np.vstack([axisx.ravel(), axisy.ravel()]).T
    clf = SVC(kernel=kernel).fit(x, y)
    p = clf.decision_function(xy).reshape(axisx.shape)
    ax.contour(axisx, axisy, p, levels=[-1, 0, 1], linestyles=['--', '-', '--'], colors='black')
    plt.savefig(filename)
    plt.clf()

# Linearly separable data
x, y = make_blobs(n_samples=50, centers=2, random_state=0, cluster_std=0.6)
svm_vis(x, y, 'linear_clf', 'linear') # separable with a linear kernel

# Ring-shaped data
x, y = make_circles(n_samples=100, factor=0.1, noise=0.1)
svm_vis(x, y, 'circle_clf_linear', 'linear') # not separable with a linear kernel
svm_vis(x, y, 'circle_clf_rbf', 'rbf') # the RBF (Gaussian radial basis) kernel handles linearly inseparable data


'''SVM classification on the breast-cancer dataset'''
data = load_breast_cancer()
x, y = data['data'], data['target']
x = StandardScaler().fit_transform(x)
print('Breast-cancer dataset: x_size: %s, y_size: %s'%(x.shape, y.shape))

# Visualize the dataset distribution after PCA
pca_xdata = PCA(2).fit_transform(x)
color = ['r', 'b']
plt.scatter(pca_xdata[:, 0][y==0], pca_xdata[:, 1][y==0])
plt.scatter(pca_xdata[:, 0][y==1], pca_xdata[:, 1][y==1])
plt.savefig("breast_dataset_vis")
plt.clf()

# Comparing kernel performance
xtrain, xtest, ytrain, ytest = train_test_split(x, y, test_size=0.3, shuffle=True)
kernels = ['linear', 'poly', 'rbf', 'sigmoid'] # linear, polynomial, Gaussian (RBF), and sigmoid kernels
for kernel in kernels:
    start = time.time()
    # cache_size: memory (in MB) allocated for the kernel cache; larger is faster
    # degree: exponent of the polynomial kernel; degree=1 makes poly a linear kernel (poly only)
    # gamma: {'scale', 'auto'} or float, default='scale' (rbf, poly, sigmoid)
    # coef0: float, default=0.0 (poly, sigmoid)
    # C: float, default=1.0; penalty strength -- the larger C is, the harder misclassification is penalized and the tighter the fit
    clf = SVC(kernel=kernel, gamma='auto', cache_size=5000, degree=1) 
    clf.fit(xtrain, ytrain)
    timespan = time.time() - start
    score = clf.score(xtest, ytest)
    print("【%s】 kernel's score is: %f, time-consumption is: %fs"%(kernel, score, timespan))
    '''
    【linear】 kernel's score is: 0.970760, time-consumption is: 0.002428s
    【rbf】 kernel's score is: 0.959064, time-consumption is: 0.003242s
    【sigmoid】 kernel's score is: 0.959064, time-consumption is: 0.002081s
    From these results the linear kernel does well while the Gaussian kernel lags far behind, likely because of large scale differences between features, or overfitting.
    After standardization the Gaussian kernel performs about as well as the linear one:
    【linear】 kernel's score is: 0.976608, time-consumption is: 0.001769s
    【poly】 kernel's score is: 0.959064, time-consumption is: 0.001492s
    【rbf】 kernel's score is: 0.976608, time-consumption is: 0.002371s
    【sigmoid】 kernel's score is: 0.964912, time-consumption is: 0.002023s
    '''
# Grid-search the gamma and coef0 parameters for the poly kernel
cv = StratifiedShuffleSplit(5, test_size=0.3)
param_grid = {'gamma': np.logspace(-10, 1, 20), 'coef0':np.linspace(0, 5, 10)}
grid = GridSearchCV(SVC(cache_size=5000, degree=1, kernel='poly'), param_grid=param_grid, cv=cv).fit(x, y)
print('To "poly" kernel, The best parameters are [%s], score is [%f]' % (grid.best_params_, grid.best_score_))

# C-value selection curve
score = []
crange = np.linspace(0.01, 30, 50, dtype=np.float16)
for c in crange:
    svm = SVC(C=c, cache_size=5000, kernel='linear').fit(xtrain, ytrain)
    score.append(svm.score(xtest, ytest))
bestscore = max(score)
bestc = crange[score.index(bestscore)]
print('max score: %.4f, c value: %.4f'%(bestscore, bestc))
plt.plot(crange, score)
plt.scatter([bestc], [bestscore], c='r', s=40)
plt.title('C value selection')
plt.xlabel('C value')
plt.ylabel('score')
plt.xticks(crange, rotation=-270, fontsize=4)
plt.grid()
plt.rcParams['savefig.dpi'] = 400  # saved-image DPI
plt.rcParams['figure.dpi'] = 400  # figure DPI
plt.savefig('Cvalue_selection')
plt.clf()

XGBoost: https://github.com/xushige/Machine-Learning--Sklearn/tree/main/XGBoost

'''XGBoost'''

import xgboost as xgb
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.datasets import load_boston
from  sklearn.model_selection import GridSearchCV, cross_val_score, KFold, train_test_split, learning_curve
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt


'''
XGBoost's native API differs from the sklearn-style interface:

xgb.DMatrix()              # data container
param = {}                 # parameter dict
model = xgb.train(param)   # training
model.predict()            # prediction
'''
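# A minimal runnable sketch of the native API above (the toy data and parameter values
# are my own assumptions, not from the repo):
rng = np.random.RandomState(0)
X_demo = rng.rand(100, 5)
y_demo = X_demo @ rng.rand(5)
dtrain = xgb.DMatrix(X_demo, label=y_demo) # native data container
param = {'max_depth': 4, 'eta': 0.3, 'objective': 'reg:squarederror'}
booster = xgb.train(param, dtrain, num_boost_round=50) # train
pred = booster.predict(xgb.DMatrix(X_demo)) # predict
print(pred[:5])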

data = load_boston()
x, y = data['data'], data['target']
x = StandardScaler().fit_transform(x)
print('data.shape: %s     label.shape: %s' % (x.shape, y.shape))

print('===============================Start train-test-split===============================')
xtrain, xtest, ytrain, ytest = train_test_split(x, y, test_size=0.3, shuffle=True)
xgb_model = xgb.XGBRegressor().fit(xtrain, ytrain)
pre = xgb_model.predict(xtest)
score = xgb_model.score(xtest, ytest) # R2_score
print('XGBoost\'s R2_score in Boston dataset : [%.4f]'%(score))

mse_error = mean_squared_error(ytest, pre)
print('XGBoost\'s MSE error in Boston dataset : [%.4f]'%(mse_error))

feature_importance = sorted(zip(xgb_model.feature_importances_, data['feature_names']), reverse=True)
print('\nTree models expose feature_importances_ for feature selection, e.g.:\n\n%s\n'%feature_importance)

print('===============================Start Cross-val===============================')
cross_score = cross_val_score(xgb.XGBRegressor(), x, y, cv=KFold(5, shuffle=True)) # like model.score: R2 for regression models, accuracy for classifiers
cross_score_rf = cross_val_score(RandomForestRegressor(), x, y, cv=KFold(5, shuffle=True)) # random forest
cross_score_linear = cross_val_score(LinearRegression(), x, y, cv=KFold(5, shuffle=True)) # linear regression
print('R2_score metrics in Boston dataset\n     XGBoost : [%.4f]\n     Random Forest: [%.4f]\n     Linear Regression : [%.4f]'%(cross_score.mean(), cross_score_rf.mean(), cross_score_linear.mean()))


print('===============================Start Visulization===============================')
def plot_learning_curve(estimator, title, x, y, cv=None):
    train_sizes, train_scores, test_scores = learning_curve(estimator, x, y, shuffle=True, cv=cv)
    plt.plot(train_sizes, train_scores.mean(axis=1), 'o-', color='r', label='Training Score', linewidth=3)
    plt.plot(train_sizes, test_scores.mean(axis=1), 'o-', color='black', label='Test Score', linewidth=3)
    plt.legend()
    plt.title(title)
    plt.xlabel('Training Samples')
    plt.ylabel('Score')
    plt.grid()
    plt.savefig(title)
    plt.clf()
plot_learning_curve(xgb.XGBRegressor(n_estimators=100), 'XGBoost_learning_curve', x, y, cv=KFold(5, shuffle=True))

def nestimator_selection_vis(nestimators=None, subsamples=None, etas=None, xlabel=None):
    if nestimators is not None:
        params = nestimators
    elif subsamples is not None:
        params = subsamples
    elif etas is not None:
        params = etas
    scores = [] # mean scores
    vars = [] # variances
    ges = [] # generalization errors
    for param in params:
        if isinstance(nestimators, list):
            model = xgb.XGBRegressor(n_estimators = param)
        elif isinstance(subsamples, list):
            model = xgb.XGBRegressor(subsample = param)
        elif isinstance(etas, list):
            model = xgb.XGBRegressor(eta = param)
        result = cross_val_score(model, x, y, cv=KFold(5, shuffle=True))
        scores.append(result.mean())
        vars.append(result.var())
        ges.append((1-result.mean())**2 + result.var()) # generalization error ≈ bias^2 + variance
    
    idx_maxscore = scores.index(max(scores))
    idx_minvar = vars.index(min(vars))
    idx_minge = ges.index(min(ges))

    print('Picked by max R2_score, %s:【%f】  score: %.4f  var: %.4f  ge: %.4f'%(xlabel, params[idx_maxscore], scores[idx_maxscore], vars[idx_maxscore], ges[idx_maxscore]))
    print('Picked by min Variance, %s:【%f】  score: %.4f  var: %.4f  ge: %.4f'%(xlabel, params[idx_minvar], scores[idx_minvar], vars[idx_minvar], ges[idx_minvar]))
    print('Picked by min Generalization error, %s:【%f】  score: %.4f  var: %.4f  ge: %.4f\n'%(xlabel, params[idx_minge], scores[idx_minge], vars[idx_minge], ges[idx_minge]))
    plt.plot(params, scores, 'o-', linewidth=2, label='score', color='#CB181B')
    plt.fill_between(params, np.array(scores)-np.array(vars), np.array(scores)+np.array(vars), color='#CB181B', alpha=0.3)
    plt.legend()
    plt.xlabel(xlabel)
    plt.ylabel('R2_score')
    plt.title(xlabel+" selection")
    plt.savefig(xlabel+' selection')
    plt.clf()
'''n_estimators: number of trees'''
nestimator_selection_vis([*range(30, 200, 10)], None, None, 'n_estimator')
'''subsample: fraction of samples drawn for each tree, in (0, 1]'''
nestimator_selection_vis(None, np.linspace(0.5, 1, 10).tolist(), None, 'subsample')
'''eta: XGBoost's learning rate, default 0.3'''
nestimator_selection_vis(None, None, np.linspace(0.2, 0.5, 20).tolist(), 'eta')

print('===============================Start Grid Search===============================')
'''Grid search'''
params = {
    'n_estimators': [*range(30, 200, 10)],
    'subsample': np.linspace(0.5, 1, 10),
    'eta': np.linspace(0.2, 0.7, 20)
}

grid = GridSearchCV(xgb.XGBRegressor(), param_grid=params, cv=KFold(5, shuffle=True)).fit(x, y)
print('Best params: %s\nBest score: %.4f'%(grid.best_params_, grid.best_score_.mean()))
