Theory posts are everywhere, but practical ones are hard to find, so while self-studying I organized the common machine-learning algorithms and everyday techniques (grid search, k-fold, imputation, visualization, etc.). For reference only.
Preprocessing: https://github.com/xushige/Machine-Learning--Sklearn/tree/main/Data_Preprocess
from sklearn.impute import SimpleImputer
import pandas as pd
'''This file is not meant to be run; it only demonstrates how the various preprocessing methods are used'''
data = 0 # placeholder; substitute a real array/DataFrame
# Missing-value imputation
# strategy: per-column fill with mean, median, most_frequent, or constant. default: 'mean'
# fill_value: used with strategy='constant' to fill with a fixed value. default: None
si = SimpleImputer()
data = si.fit_transform(data) # data.shape == (n, m): n samples, m feature dimensions
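'''Since this file is illustrative only, here is a minimal runnable sketch of the mean fill above (hypothetical toy data):'''
import numpy as np
toy = pd.DataFrame({'age': [20, np.nan, 40], 'fare': [7.0, 8.0, np.nan]})
print(SimpleImputer(strategy='mean').fit_transform(toy)) # -> [[20. 7.] [30. 8.] [40. 7.5]]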
# Encoding categorical text labels
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
le = le.fit(data) # data is a 1-D array
label = le.transform(data)
'''equivalently: label = le.fit_transform(data)'''
data.iloc[:, -1] = LabelEncoder().fit_transform(data.iloc[:, -1]) # in practice: encode the last column of a DataFrame
# Converting categorical features to ordinal integers
from sklearn.preprocessing import OrdinalEncoder
data.iloc[:, :] = OrdinalEncoder().fit_transform(data.iloc[:, :]) # unlike LabelEncoder, this encodes multiple categorical columns at once
# OrdinalEncoder maps categories straight to 0, 1, 2, ..., which imposes an ordering the categories may not actually have
# Dummy (one-hot) variables describe parallel, unrelated categories more faithfully
from sklearn.preprocessing import OneHotEncoder
result = OneHotEncoder(categories='auto').fit_transform(data.iloc[:, :]).toarray() # fit_transform returns a sparse matrix, so densify it
# result holds the one-hot encoding of the specified columns; one column expands into one column per category, so drop the originals and concat the one-hot dummies
data = data.drop(['column1', 'column2'], axis=1)
data = pd.concat([data, pd.DataFrame(result)], axis=1) # column-wise concatenation
data.columns = ['new_column0', 'new_column1', 'new_column2']
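'''Instead of renaming the dummy columns by hand, the fitted encoder can generate the names; a sketch, assuming sklearn >= 1.0 (older versions spell it get_feature_names):'''
ohe = OneHotEncoder(categories='auto').fit(data.iloc[:, :])
dummy_names = ohe.get_feature_names_out() # one name per generated dummy column, e.g. 'x0_female'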
# Binarize a continuous feature against a threshold
from sklearn.preprocessing import Binarizer
data.iloc[:, 0:1] = Binarizer(threshold=30).fit_transform(data.iloc[:, 0:1]) # Binarizer cannot take a 1-D array, hence the 2-D slice
Decision Tree: https://github.com/xushige/Machine-Learning--Sklearn/tree/main/Decision%20Tree
import pandas as pd
import numpy as np
from sklearn import tree
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
data = pd.read_csv('titanic_data/train.csv')
print(data.info(), data.head(10))
#Drop features with no predictive influence
data.drop(['Cabin', 'Name', 'Ticket'], axis=1, inplace=True)
print(data.info())
#Fill the many missing Age values with the mean
data['Age'].fillna(data['Age'].mean(), inplace=True)
print(data.info())
#Drop the remaining rows that contain NaN (align the data to its complete rows)
data.dropna(axis=0, inplace=True)
print(data.info())
#Convert object-type columns to integer labels; unique() yields the array of object-type categories
transform = [data['Embarked'], data['Sex']]
for eachitem in transform:
    eachitem.replace(eachitem.unique(), [i for i in range(len(eachitem.unique()))], inplace=True)
print(data.info())
#Split off the y label to obtain the x data
y = data['Survived']
data.drop('Survived', axis=1, inplace=True)
print(data.head(10), '\n', y)
#With cross-validation there would be no need to split the dataset
X_train, X_test, Y_train, Y_test = train_test_split(data, y, test_size=0.3)
#Grid search: enumerate parameter combinations, then plug the best-scoring one into the instantiated model
parameters = {
'criterion': ('gini', 'entropy'),
'max_depth': [*range(1, 5)],
'min_samples_leaf': [*range(1, 10, 2)],
'min_impurity_decrease': np.arange(0, 0.5, 0.1)
}
clf = tree.DecisionTreeClassifier(criterion='entropy', max_depth=3, min_samples_leaf=3, min_impurity_decrease=0)
Grid_Search = GridSearchCV(estimator=clf, param_grid=parameters, cv=10).fit(data, y)
print(Grid_Search.best_params_, Grid_Search.best_score_)
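'''GridSearchCV refits the best combination on all the data passed to fit() (refit=True by default), so the winning model is ready to use; a sketch evaluating it on the split from above. Note that the search was fit on all of (data, y), so X_test leaked into the search and this score is optimistic; fitting the search on X_train only would avoid that:'''
best_clf = Grid_Search.best_estimator_ # already refit on the data passed to fit()
print('held-out accuracy: %.4f' % best_clf.score(X_test, Y_test))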
Dimensionality Reduction: https://github.com/xushige/Machine-Learning--Sklearn/tree/main/Decomposition
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
from sklearn.datasets import load_iris
'''Dimensionality reduction always loses some information'''
iris = load_iris()
x = iris['data']
y = iris['target']
print(x.shape, y.shape) # 4-dimensional features
'''n_components: target dimensionality'''
pca = PCA(n_components=2)
pca = pca.fit(x)
x_pca = pca.transform(x)
print(x_pca.shape)
'''Visualize the data distribution after PCA'''
for classes in range(len(set(y))):
    plt.scatter(x_pca[y==classes, 0], x_pca[y==classes, 1], label=iris['target_names'][classes])
plt.legend()
plt.title('PCA of IRIS dataset')
plt.savefig('PCA_visualization')
plt.clf()
'''explained_variance_: the larger a component's variance, the more information it carries and the more useful it is for classification'''
explained_variance = pca.explained_variance_
explained_variance_ratio = pca.explained_variance_ratio_
print(explained_variance, explained_variance_ratio)
print('retained information ratio: %f' % (explained_variance_ratio.sum()))
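'''n_components also accepts a float in (0, 1): with svd_solver='full', PCA keeps just enough components to reach that explained-variance ratio, a sketch:'''
pca95 = PCA(n_components=0.95, svd_solver='full').fit(x)
print(pca95.n_components_, pca95.explained_variance_ratio_.sum())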
'''Plot n_components vs. explained_variance_ratio and pick n with the elbow method'''
explained_variance_ratios = []
for n in range(1, 5):
    pca = PCA(n)
    pca = pca.fit(x)
    explained_variance_ratios.append(pca.explained_variance_ratio_.sum())
plt.plot([*range(1, 5)], explained_variance_ratios)
plt.xticks([*range(1, 5)])
plt.xlabel('n-components')
plt.ylabel('explained_variance_ratio')
plt.grid(axis='y')
plt.savefig('n-components selection')
plt.clf()
KMeans: https://github.com/xushige/Machine-Learning--Sklearn/tree/main/K_means
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_samples, silhouette_score # silhouette coefficients, for evaluating clustering quality
from sklearn.datasets import make_blobs
import matplotlib.pyplot as plt
## Clustering idea: small differences within clusters, large differences between clusters
'''
Create a dataset
n_samples: number of samples
n_features: feature dimensionality
centers: number of clusters
'''
x, y = make_blobs(n_samples=500, n_features=2, centers=4, random_state=1)
'''Visualize the raw data distribution'''
fig, ax1 = plt.subplots(1) # a figure with a single subplot
ax1.scatter(x[:, 0], x[:, 1], marker='o')
plt.savefig('data_distribution')
plt.clf()
'''Visualize the clustering result'''
n_clusters = 4
kmeans = KMeans(n_clusters=n_clusters)
kmeans.fit(x)
y_pred = kmeans.labels_ # predicted labels of the training data
centers = kmeans.cluster_centers_ # cluster-center coordinates
for each in range(n_clusters):
    plt.scatter(x[y_pred==each, 0], x[y_pred==each, 1], marker='o', label='class:%d'%each)
plt.legend()
plt.savefig('kmeans_vis')
plt.clf()
'''Silhouette coefficient'''
score = silhouette_score(x, y_pred)
print('Silhouette score:【%.4f】'%score)
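'''silhouette_samples (imported above) gives one coefficient per sample; averaging per cluster shows which clusters are well separated, a sketch:'''
sample_scores = silhouette_samples(x, y_pred)
for each in range(n_clusters):
    print('cluster %d mean silhouette: %.4f' % (each, sample_scores[y_pred==each].mean()))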
'''Choose the k value based on the silhouette score'''
res = []
for k in range(2, 10):
    model = KMeans(k)
    model.fit(x)
    y_pred = model.labels_
    res.append(silhouette_score(x, y_pred))
plt.plot([*range(2, 10)], res, marker='o')
plt.xlabel('K value')
plt.ylabel('silhouette_score')
plt.savefig('K_value_selection')
plt.clf()
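'''Besides the silhouette score, a fitted KMeans exposes inertia_ (the within-cluster sum of squared distances); plotting it against k gives the classic elbow curve, a sketch:'''
inertias = []
for k in range(2, 10):
    inertias.append(KMeans(k).fit(x).inertia_)
plt.plot([*range(2, 10)], inertias, marker='o')
plt.xlabel('K value')
plt.ylabel('inertia (within-cluster SSE)')
plt.savefig('K_value_elbow')
plt.clf()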
Linear Regression: https://github.com/xushige/Machine-Learning--Sklearn/tree/main/Linear%20regression
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedShuffleSplit, ShuffleSplit
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import load_wine
from sklearn.metrics import mean_squared_error, r2_score, explained_variance_score
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
'''Data preprocessing (load, standardize, split)'''
data = load_wine()
x, y = data['data'], data['target']
print('data shape: %s label shape: %s' % (x.shape, y.shape))
feature_names = data['feature_names']
x = pd.DataFrame(x, columns=feature_names)
x = StandardScaler().fit_transform(x)
trainx, testx, trainy, testy = train_test_split(x, y, test_size=0.3, shuffle=True)
'''Build the regression model'''
# fit_intercept: whether to fit an intercept. default: True
# normalize: column-wise normalization, similar to StandardScaler (deprecated in newer sklearn versions). default: False
# copy_X: compute on a copy of X. default: True
# n_jobs: number of jobs for the computation; -1 uses all CPUs. default: None
linear = LinearRegression(fit_intercept=True).fit(trainx, trainy)
score = linear.score(testx, testy)
coef = sorted(list(zip(feature_names, linear.coef_)), key=lambda x:x[1], reverse=True)
intercept = linear.intercept_
print('====================================================================================================')
print("When split-style is train/test==7:3, the score of linearregression is: [%.4f]\nThe intercept is: [%.4f]\nImportance condition:\n%s"%(score, intercept, coef))
print('====================================================================================================')
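'''The fitted model is y = X·coef_ + intercept_; a quick sketch verifying that predict() matches the explicit formula:'''
manual_pred = testx @ linear.coef_ + linear.intercept_
print('max |predict - manual|: %e' % np.abs(linear.predict(testx) - manual_pred).max())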
cross_score = cross_val_score(LinearRegression(), x, y, cv=ShuffleSplit(5, test_size=0.3))
print("When split-style is 5-fold, the score of linearregression is: [%.4f]"%(cross_score.mean()))
print('====================================================================================================')
'''Regression metrics: is the prediction accurate (precision-like), and is enough information captured (recall-like)'''
# Is the prediction accurate (precision-like)
# Mean Squared Error (MSE). NB: sklearn's 'neg_mean_squared_error' scorer returns negative values, i.e. a loss
print("MSE SCORE:")
mse_score = mean_squared_error(linear.predict(testx), testy)
print('When split-style is train/test==7:3, MSE score is: %.4f'%(mse_score))
cross_mse_score = cross_val_score(LinearRegression(), x, y, cv=ShuffleSplit(5, test_size=0.3), scoring='neg_mean_squared_error')
print("When split-style is 5-fold, the MSE core of linearregression is: [%.4f]"%(cross_mse_score.mean()))
print('====================================================================================================')
# Does the model capture enough information (recall-like)
# 1. R2_score: model.score defaults to r2_score for regressors
print("R2 SCORE:")
print('When split-style is train/test==7:3, R2 score is: %.4f'%(score))
cross_r2_score = cross_val_score(LinearRegression(), x, y, cv=ShuffleSplit(5, test_size=0.3), scoring='r2')
print("When split-style is 5-fold, the R2 core of linearregression is: [%.4f]"%(cross_r2_score.mean()))
print('====================================================================================================')
# Explained variance score
evs = explained_variance_score(testy, linear.predict(testx))
print('EVS of Linear Regression: %.4f'%(evs))
'''可视化'''
plt.plot(testy, label='True')
plt.plot(linear.predict(testx), label='Pred')
plt.legend()
plt.ylim(-4, 6)
plt.title('Linear Regression Effect')
plt.savefig('linear_regression_effect')
plt.clf()
Logistic Regression: https://github.com/xushige/Machine-Learning--Sklearn/tree/main/Logistic%20regression
from sklearn import linear_model
from sklearn.datasets import load_breast_cancer
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
## Logistic regression: especially well suited to fitting linear relationships
## y = sigmoid(wx)
'''Dataset'''
data = load_breast_cancer()
x = data['data']
y = data['target']
print(x.shape, y.shape)
'''
Logistic regression model
penalty: ['l1', 'l2']. default: 'l2'
C: inverse of regularization strength, a positive float; the smaller C is, the stronger the regularization
'''
lr1 = linear_model.LogisticRegression(penalty='l1', solver='liblinear', C=0.5, max_iter=1000)
lr2 = linear_model.LogisticRegression(penalty='l2', solver='liblinear', C=0.5, max_iter=1000)
lr1.fit(x, y)
lr2.fit(x, y)
# Inspect the feature weights: L1 regularization drives some coefficients exactly to 0 (an implicit dimensionality reduction), while L2 only shrinks them toward 0
print(lr1.coef_.shape, lr2.coef_.shape)
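'''A sketch making the sparsity claim above concrete by counting exact zeros in each coefficient vector:'''
print('L1 zero coefficients: %d / %d' % ((lr1.coef_ == 0).sum(), lr1.coef_.size))
print('L2 zero coefficients: %d / %d' % ((lr2.coef_ == 0).sum(), lr2.coef_.size))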
'''Split the dataset'''
xtrain, xtest, ytrain, ytest = train_test_split(x, y, test_size=0.3, shuffle=True)
'''Choosing the C value'''
l1, l2 = [], []
for i in range(5, 101, 5):
    c = i/100
    lr1 = linear_model.LogisticRegression(penalty='l1', solver='liblinear', C=c, max_iter=1000)
    lr1.fit(xtrain, ytrain)
    acc1 = (lr1.predict(xtest)==ytest).sum() / len(ytest)
    l1.append(acc1)
    lr2 = linear_model.LogisticRegression(penalty='l2', solver='liblinear', C=c, max_iter=1000)
    lr2.fit(xtrain, ytrain)
    acc2 = (lr2.predict(xtest)==ytest).sum() / len(ytest)
    l2.append(acc2)
plt.plot([i/100 for i in range(5, 101, 5)], l1, label='LR-L1')
plt.plot([i/100 for i in range(5, 101, 5)], l2, label='LR-L2')
plt.legend()
plt.xlabel('C value')
plt.title('C value selection')
plt.ylabel('Test Accuracy')
plt.savefig('C_value_selection')
plt.clf()
Naive Bayes: https://github.com/xushige/Machine-Learning--Sklearn/tree/main/Naive%20Bayes
from sklearn.naive_bayes import GaussianNB
from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
'''Load, standardize, and split the dataset'''
data = load_digits()
x, y = data['data'], data['target']
x = StandardScaler().fit_transform(x)
print('data shape: %s label shape: %s' % (x.shape, y.shape))
trainx, testx, trainy, testy = train_test_split(x, y, test_size=0.3, shuffle=True)
'''Build the naive Bayes model'''
# priors: 1-D array of length n_classes giving the class prior probabilities P(Y=yi); if None they are estimated from the data. default: None
# var_smoothing: float added to the variances for numerical stability. default: 1e-9
nb = GaussianNB().fit(trainx, trainy)
# Mean accuracy (for classifiers, model.score returns accuracy, not R2)
score = nb.score(testx, testy)
print('prediction accuracy is: %.4f'%(score))
# Probabilities
prob = nb.predict_proba(testx)
print("probability shape: %s -- the posterior probability of every label for each sample" % str(prob.shape))
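'''A quick sketch checking two properties of predict_proba: each row sums to 1, and the per-row argmax agrees with predict() (columns follow nb.classes_, so map through it for full generality):'''
import numpy as np
print('rows sum to 1:', np.allclose(prob.sum(axis=1), 1))
print('argmax == predict:', (nb.classes_[prob.argmax(axis=1)] == nb.predict(testx)).all())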
'''Visualize the confusion matrix'''
cm = confusion_matrix(testy, nb.predict(testx))
print('=======================================\nconfusion matrix:\n%s\n======================================='%(cm))
plt.imshow(cm, interpolation='nearest', cmap=plt.cm.Blues)
for i in range(cm.shape[0]):
    for j in range(cm.shape[1]):
        plt.text(i-0.2, j+0.2, str(cm[j][i]), color='white' if cm[j][i]>35 else 'black')
plt.title('confusion matrix')
plt.xticks([*range(prob.shape[1])])
plt.yticks([*range(prob.shape[1])])
plt.xlabel('Pred Label')
plt.ylabel('True Label')
plt.tight_layout()
plt.savefig('confusion matrix')
Random Forest: https://github.com/xushige/Machine-Learning--Sklearn/tree/main/Random%20Forest
from sklearn.datasets import load_wine
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
wine_data = load_wine()
Xtrain, Xtest, Ytrain, Ytest = train_test_split(wine_data.data, wine_data.target, test_size=0.3)
# Instantiation does not grow any trees yet; the trees are built on fit()
# In a random forest, random_state fixes the whole ensemble: each tree gets its own reproducible seed, rather than every tree sharing one identical random_state
# bootstrap: each tree is trained on n samples drawn with replacement
clf = DecisionTreeClassifier(criterion='entropy', max_depth=3, min_samples_leaf=3, random_state=0)
rfc = RandomForestClassifier(n_estimators=35, criterion='entropy', max_depth=4, min_samples_leaf=1, random_state=0)
def normal_compare():
    global clf, rfc
    clf = clf.fit(Xtrain, Ytrain)
    rfc = rfc.fit(Xtrain, Ytrain)
    score_tree = clf.score(Xtest, Ytest)
    score_forest = rfc.score(Xtest, Ytest)
    print('Tree normal test score: 【%f】, Forest normal test score:【%f】' % (score_tree, score_forest))
def cross_val():
    global clf, rfc
    cross_score_tree = cross_val_score(clf, wine_data.data, wine_data.target, cv=10).mean()
    cross_score_forest = cross_val_score(rfc, wine_data.data, wine_data.target, cv=10).mean()
    print('Tree cross_val_test score:【%f】, Forest cross_val_test score:【%f】' % (cross_score_tree, cross_score_forest))
def forest_grid_search():
    parameters = {
        'n_estimators': [*range(10, 50, 5)],
        'max_depth': [*range(1, 6)],
        'min_samples_leaf': [*range(1, 10, 2)],
        'criterion': ['entropy', 'gini']
    }
    GS = GridSearchCV(rfc, param_grid=parameters, cv=10).fit(wine_data.data, wine_data.target)
    print('Forest best params: %s\nForest best score:【%f】' % (GS.best_params_, GS.best_score_))
def tree_grid_search():
    parameters = {
        'max_depth': [*range(1, 6)],
        'min_samples_leaf': [*range(1, 10, 2)],
        'criterion': ['entropy', 'gini']
    }
    GS = GridSearchCV(clf, param_grid=parameters, cv=10).fit(wine_data.data, wine_data.target)
    print('Tree best params: %s\nTree best score:【%f】' % (GS.best_params_, GS.best_score_))
normal_compare()
cross_val()
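# The two grid-search helpers above are defined but never invoked in the original run; each is slow (cv=10 fits per parameter combination). Uncomment to run them:
# forest_grid_search()
# tree_grid_search()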
from sklearn.ensemble import RandomForestRegressor
from sklearn.datasets import load_boston
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.impute import SimpleImputer
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
boston = load_boston()
x, _, y, _ = train_test_split(boston['data'], boston['target'], shuffle=True, test_size=1) # used here purely to shuffle; only a single sample is held out
'''Manufacture missing values'''
missing_rate = 0.5
n_miss_samples = int(x.shape[0]*x.shape[1]*missing_rate)
rng = np.random.RandomState(0) # fixed seed; plain np.random.randint would work just as well
missing_features = rng.randint(0, x.shape[1], n_miss_samples)
missing_samples = rng.randint(0, x.shape[0], n_miss_samples)
x[missing_samples, missing_features] = np.nan # set the chosen index positions to nan
x = pd.DataFrame(x)
print(x)
def mean_fill(x):
    '''Fill missing values with the column mean'''
    # Instantiate: fill np.nan with the column mean; with strategy='constant' the value comes from fill_value
    imp_mean = SimpleImputer(missing_values=np.nan, strategy='mean', fill_value=0)
    x_fill = imp_mean.fit_transform(x) # transform
    print(pd.DataFrame(x_fill).isna().sum()) # check that no nan values remain
    rfr = RandomForestRegressor()
    score = cross_val_score(rfr, x_fill, y, cv=10)
    print(score)
def randomforest_fill(x):
    '''Fill missing values with a random forest (i.e. predict each missing entry); the original was left unfinished, so this is a completed sketch of that idea'''
    x = x.copy()
    sortidx = np.argsort(x.isnull().sum(axis=0)).values # features ordered from fewest to most missing values
    for idx in sortidx:
        target = x.iloc[:, idx] # the feature currently being filled
        known = target.notnull()
        if known.all():
            continue
        # use all other features plus the label y as predictors, temporarily filling their nan with 0
        df = pd.concat([x.drop(x.columns[idx], axis=1), pd.DataFrame(y)], axis=1)
        df_0 = SimpleImputer(missing_values=np.nan, strategy='constant', fill_value=0).fit_transform(df)
        rfr = RandomForestRegressor().fit(df_0[known.values], target[known])
        x.loc[~known, x.columns[idx]] = rfr.predict(df_0[(~known).values])
    return x
mean_fill(x)
x_rf_fill = randomforest_fill(x)
print(x_rf_fill.isna().sum()) # verify no nan values remain
print(cross_val_score(RandomForestRegressor(), x_rf_fill, y, cv=10))
Support Vector Machine (SVM): https://github.com/xushige/Machine-Learning--Sklearn/tree/main/SVM
from sklearn.datasets import make_blobs, make_circles, load_breast_cancer
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedShuffleSplit
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.svm import SVC
import matplotlib.pyplot as plt
import numpy as np
import time
'''Visualizing binary SVM classification'''
def svm_vis(x, y, filename, kernel='linear'):
    print('x_size: %s, y_size:%s'%(x.shape, y.shape))
    plt.scatter(x[:, 0], x[:, 1], c=y, s=50, cmap='rainbow')
    ax = plt.gca()
    xlim = ax.get_xlim()
    ylim = ax.get_ylim()
    axisx = np.linspace(xlim[0], xlim[1], 30)
    axisy = np.linspace(ylim[0], ylim[1], 30)
    axisx, axisy = np.meshgrid(axisx, axisy)
    xy = np.vstack([axisx.ravel(), axisy.ravel()]).T
    clf = SVC(kernel=kernel).fit(x, y)
    p = clf.decision_function(xy).reshape(axisx.shape) # use the public decision_function, not the private _decision_function
    ax.contour(axisx, axisy, p, levels=[-1, 0, 1], linestyles=['--', '-', '--'], colors='black')
    plt.savefig(filename)
    plt.clf()
# Linearly separable data
x, y = make_blobs(n_samples=50, centers=2, random_state=0, cluster_std=0.6)
svm_vis(x, y, 'linear_clf', 'linear') # separable with a linear kernel
# Ring-shaped data
x, y = make_circles(n_samples=100, factor=0.1, noise=0.1)
svm_vis(x, y, 'circle_clf_linear', 'linear') # not separable with a linear kernel
svm_vis(x, y, 'circle_clf_rbf', 'rbf') # the rbf (Gaussian radial basis) kernel handles the linearly inseparable data
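'''A fitted SVC also exposes the support vectors themselves; a sketch (same x, y as above) marking them with hollow rings on the scatter plot:'''
clf = SVC(kernel='rbf').fit(x, y)
plt.scatter(x[:, 0], x[:, 1], c=y, s=50, cmap='rainbow')
plt.scatter(clf.support_vectors_[:, 0], clf.support_vectors_[:, 1], s=150, facecolors='none', edgecolors='black') # hollow rings = support vectors
plt.savefig('support_vectors_vis')
plt.clf()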
'''SVM classification on the breast-cancer dataset'''
data = load_breast_cancer()
x, y = data['data'], data['target']
x = StandardScaler().fit_transform(x)
print('breast-cancer dataset: x_size: %s, y_size:%s'%(x.shape, y.shape))
# Reduce to 2-D to visualize the data distribution
pca_xdata = PCA(2).fit_transform(x)
color = ['r', 'b']
plt.scatter(pca_xdata[:, 0][y==0], pca_xdata[:, 1][y==0], c=color[0])
plt.scatter(pca_xdata[:, 0][y==1], pca_xdata[:, 1][y==1], c=color[1])
plt.savefig("breast_dataset_vis")
plt.clf()
# Comparing kernel functions
xtrain, xtest, ytrain, ytest = train_test_split(x, y, test_size=0.3, shuffle=True)
kernels = ['linear', 'poly', 'rbf', 'sigmoid'] # linear, polynomial, Gaussian RBF, and sigmoid kernels
for kernel in kernels:
    start = time.time()
    # cache_size: how much memory (MB) to use for the kernel cache; larger is faster
    # degree: degree of the polynomial kernel; degree=1 makes poly behave like a linear kernel (only used by poly)
    # gamma: {'scale', 'auto'} or float, default='scale' (used by rbf, poly, sigmoid)
    # coef0: float, default=0.0 (used by poly and sigmoid)
    # C: float, default=1.0, the penalty parameter; larger C penalizes margin violations more, fitting the training data more tightly
    clf = SVC(kernel=kernel, gamma='auto', cache_size=5000, degree=1)
    clf.fit(xtrain, ytrain)
    timespan = time.time() - start
    score = clf.score(xtest, ytest)
    print("【%s】 kernel's score is: %f, time-consumption is: %fs"%(kernel, score, timespan))
'''
【linear】 kernel's score is: 0.970760, time-consumption is: 0.002428s
【rbf】 kernel's score is: 0.959064, time-consumption is: 0.003242s
【sigmoid】 kernel's score is: 0.959064, time-consumption is: 0.002081s
From these results the linear kernel does well while the Gaussian kernel lags behind, likely because the feature scales differ too much, or it overfits;
after standardization the Gaussian kernel scores about the same as the linear kernel:
【linear】 kernel's score is: 0.976608, time-consumption is: 0.001769s
【poly】 kernel's score is: 0.959064, time-consumption is: 0.001492s
【rbf】 kernel's score is: 0.976608, time-consumption is: 0.002371s
【sigmoid】 kernel's score is: 0.964912, time-consumption is: 0.002023s
'''
# Grid-search the gamma and coef0 parameters of the poly kernel
cv = StratifiedShuffleSplit(5, test_size=0.3)
param_grid = {'gamma': np.logspace(-10, 1, 20), 'coef0':np.linspace(0, 5, 10)}
grid = GridSearchCV(SVC(cache_size=5000, degree=1, kernel='poly'), param_grid=param_grid, cv=cv).fit(x, y)
print('To "poly" kernel, The best parameters are [%s], score is [%f]' % (grid.best_params_, grid.best_score_))
# C-value selection curve
score = []
crange = np.linspace(0.01, 30, 50, dtype=np.float16)
for c in crange:
    svm = SVC(C=c, cache_size=5000, kernel='linear').fit(xtrain, ytrain)
    score.append(svm.score(xtest, ytest))
bestscore = max(score)
bestc = crange[score.index(bestscore)]
print('max score: %.4f, c value: %.4f'%(bestscore, bestc))
plt.plot(crange, score)
plt.scatter([bestc], [bestscore], c='r', s=40)
plt.title('C value selection')
plt.xlabel('C value')
plt.ylabel('score')
plt.xticks(crange, rotation=-270, fontsize=4)
plt.grid()
plt.rcParams['savefig.dpi'] = 400 # saved-image DPI
plt.rcParams['figure.dpi'] = 400 # figure resolution
plt.savefig('Cvalue_selection')
plt.clf()
XGBoost: https://github.com/xushige/Machine-Learning--Sklearn/tree/main/XGBoost
'''XGBoost'''
import xgboost as xgb
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.datasets import load_boston
from sklearn.model_selection import GridSearchCV, cross_val_score, KFold, train_test_split, learning_curve
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
'''
XGBoost's native API differs from the sklearn-style interface:
xgb.DMatrix()            # data loading
param = {}               # parameter setup
model = xgb.train(param) # training
model.predict()          # prediction
(a runnable sketch of this native API follows the first train/test evaluation below)
'''
data = load_boston()
x, y = data['data'], data['target']
x = StandardScaler().fit_transform(x)
print('data.shape: %s label.shape: %s' % (x.shape, y.shape))
print('===============================Start train-test-split===============================')
xtrain, xtest, ytrain, ytest = train_test_split(x, y, test_size=0.3, shuffle=True)
xgb_model = xgb.XGBRegressor().fit(xtrain, ytrain)
pre = xgb_model.predict(xtest)
score = xgb_model.score(xtest, ytest) # R2_score
print('XGBoost\'s R2_score in Boston dataset : [%.4f]'%(score))
mse_error = mean_squared_error(ytest, pre)
print('XGBoost\'s MSE error in Boston dataset : [%.4f]'%(mse_error))
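'''A runnable sketch of the native API described at the top of this section, assuming a recent xgboost where the squared-error objective is spelled 'reg:squarederror':'''
dtrain = xgb.DMatrix(xtrain, label=ytrain)
dtest = xgb.DMatrix(xtest)
param = {'eta': 0.3, 'max_depth': 6, 'objective': 'reg:squarederror'}
bst = xgb.train(param, dtrain, num_boost_round=100)
print('Native-API XGBoost MSE: [%.4f]' % mean_squared_error(ytest, bst.predict(dtest)))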
feature_importance = sorted(zip(xgb_model.feature_importances_, data['feature_names']), reverse=True)
print('\nTree models expose feature importances that can be used for feature selection, as below:\n\n%s\n'%feature_importance)
print('===============================Start Cross-val===============================')
cross_score = cross_val_score(xgb.XGBRegressor(), x, y, cv=KFold(5, shuffle=True)) # consistent with model.score: R2_score for regressors, accuracy for classifiers
cross_score_rf = cross_val_score(RandomForestRegressor(), x, y, cv=KFold(5, shuffle=True)) # random forest
cross_score_linear = cross_val_score(LinearRegression(), x, y, cv=KFold(5, shuffle=True)) # linear regression
print('R2_score metrics in Boston dataset\n XGBoost : [%.4f]\n Random Forest: [%.4f]\n Linear Regression : [%.4f]'%(cross_score.mean(), cross_score_rf.mean(), cross_score_linear.mean()))
print('===============================Start Visualization===============================')
def plot_learning_curve(estimator, title, x, y, cv=None):
    train_sizes, train_scores, test_scores = learning_curve(estimator, x, y, shuffle=True, cv=cv)
    plt.plot(train_sizes, train_scores.mean(axis=1), 'o-', color='r', label='Training Score', linewidth=3)
    plt.plot(train_sizes, test_scores.mean(axis=1), 'o-', color='black', label='Test Score', linewidth=3)
    plt.legend()
    plt.title(title)
    plt.xlabel('Training Samples')
    plt.ylabel('Score')
    plt.grid()
    plt.savefig(title)
    plt.clf()
plot_learning_curve(xgb.XGBRegressor(n_estimators=100), 'XGBoost_learning_curve', x, y, cv=KFold(5, shuffle=True))
def nestimator_selection_vis(nestimators=None, subsamples=None, etas=None, xlabel=None):
    if nestimators is not None:
        params = nestimators
    elif subsamples is not None:
        params = subsamples
    elif etas is not None:
        params = etas
    scores = [] # mean R2 scores
    vars = [] # variances
    ges = [] # generalization errors
    for param in params:
        if isinstance(nestimators, list):
            model = xgb.XGBRegressor(n_estimators=param)
        elif isinstance(subsamples, list):
            model = xgb.XGBRegressor(subsample=param)
        elif isinstance(etas, list):
            model = xgb.XGBRegressor(eta=param)
        result = cross_val_score(model, x, y, cv=KFold(5, shuffle=True))
        scores.append(result.mean())
        vars.append(result.var())
        ges.append((1-result.mean())**2 + result.var())
    idx_maxscore = scores.index(max(scores))
    idx_minvar = vars.index(min(vars))
    idx_minge = ges.index(min(ges))
    print('Picking by max R2_score, %s: 【%f】 score: %.4f var: %.4f ge: %.4f'%(xlabel, params[idx_maxscore], scores[idx_maxscore], vars[idx_maxscore], ges[idx_maxscore]))
    print('Picking by min variance, %s: 【%f】 score: %.4f var: %.4f ge: %.4f'%(xlabel, params[idx_minvar], scores[idx_minvar], vars[idx_minvar], ges[idx_minvar]))
    print('Picking by min generalization error, %s: 【%f】 score: %.4f var: %.4f ge: %.4f\n'%(xlabel, params[idx_minge], scores[idx_minge], vars[idx_minge], ges[idx_minge]))
    plt.plot(params, scores, 'o-', linewidth=2, label='score', color='#CB181B')
    plt.fill_between(params, np.array(scores)-np.array(vars), np.array(scores)+np.array(vars), color='#CB181B', alpha=0.3)
    plt.legend()
    plt.xlabel(xlabel)
    plt.ylabel('R2_score')
    plt.title(xlabel+" selection")
    plt.savefig(xlabel+' selection')
    plt.clf()
'''n_estimators: number of trees'''
nestimator_selection_vis([*range(30, 200, 10)], None, None, 'n_estimators')
'''subsample: fraction of the training samples drawn for each tree, range (0, 1]'''
nestimator_selection_vis(None, np.linspace(0.5, 1, 10).tolist(), None, 'subsample')
'''eta: XGBoost's learning rate, default 0.3'''
nestimator_selection_vis(None, None, np.linspace(0.2, 0.5, 20).tolist(), 'eta')
print('===============================Start Grid Search===============================')
'''Grid search'''
params = {
'n_estimators': [*range(30, 200, 10)],
'subsample': np.linspace(0.5, 1, 10),
'eta': np.linspace(0.2, 0.7, 20)
}
grid = GridSearchCV(xgb.XGBRegressor(), param_grid=params, cv=KFold(5, shuffle=True)).fit(x, y)
print('Best params: %s\nBest score: %.4f'%(grid.best_params_, grid.best_score_))