机器学习(Hands on)第二章修正版完整代码

机器学习(Hands on)第二章修正版完整代码,第1张

前言

最近在学习机器学习的相关知识。随着Python版本的更新,《Hands-On Machine Learning with Scikit-Learn & TensorFlow》书中的部分代码已不再适用,因此我根据在百度上查到的一些改动做了总结(具体改动之处已经记不清,想了解细节的读者可以对照原书自行比较)。下面是第二章的完整代码:

代码
import numpy as np
import os
import pandas as pd
import tarfile
from six.moves import urllib

# Location of the housing archive and the local directory it is unpacked into.
DOWNLOAD_ROOT = "https://raw.githubusercontent.com/ageron/handson-ml/master/"
HOUSING_PATH = os.path.join("datasets", "housing")
HOUSING_URL = DOWNLOAD_ROOT + "datasets/housing/housing.tgz"


def fetch_housing_data(housing_url=HOUSING_URL, housing_path=HOUSING_PATH):
    """Download the housing archive and unpack it into ``housing_path``.

    Args:
        housing_url: URL of the ``housing.tgz`` archive to download.
        housing_path: Directory that receives the archive and its extracted
            members; it is created (including parents) when missing.
    """
    # exist_ok=True replaces the separate isdir() check.
    os.makedirs(housing_path, exist_ok=True)
    tgz_path = os.path.join(housing_path, "housing.tgz")
    # Download the file at housing_url to tgz_path.
    urllib.request.urlretrieve(housing_url, tgz_path)
    # NOTE(review): extractall() trusts the member paths stored in the
    # archive, so only point this function at a trusted URL.
    # The context manager closes the archive even if extraction fails.
    with tarfile.open(tgz_path) as housing_tgz:
        housing_tgz.extractall(path=housing_path)
# Read the dataset with pandas.
def load_housing_data(housing_path=HOUSING_PATH):
    """Return ``housing.csv`` found under ``housing_path`` as a DataFrame."""
    return pd.read_csv(os.path.join(housing_path, "housing.csv"))


housing = load_housing_data()
import matplotlib.pyplot as plt

# Build a test set by random sampling.
def split_train_test(data, test_ratio):
    """Randomly split ``data`` into a ``(train, test)`` pair of row subsets.

    The first ``int(len(data) * test_ratio)`` positions of a random
    permutation form the test set; the remaining positions form the
    training set.
    """
    n_rows = len(data)
    cut = int(n_rows * test_ratio)
    order = np.random.permutation(n_rows)
    return data.iloc[order[cut:]], data.iloc[order[:cut]]
import hashlib

#hash值的最后一个字节小于51的划入测试集
def test_set_check(identifier, test_ratio, hash):
    return hash(np.int64(identifier)).digest()[-1] < 256 * test_ratio

def split_train_test_by_id(data, test_ratio, id_column, hash=hashlib.md5):
    """Split ``data`` into ``(train, test)`` using a hash of ``id_column``.

    Each value of ``id_column`` is passed to ``test_set_check``; rows for
    which it returns True form the test set, all others the training set.
    """
    def _belongs_to_test(id_):
        return test_set_check(id_, test_ratio, hash)

    test_mask = data[id_column].apply(_belongs_to_test)
    return data.loc[~test_mask], data.loc[test_mask]
#利用行号创建标识符
#housing_with_id = housing.reset_index() # 加入 'index' 列
#train_set, test_set = split_train_test_by_id(housing_with_id, 0.2, 'index')
#新的方法产生唯一标识符
#housing_with_id['id'] = housing['longitude']*1000 + housing['latitude']
#train_set, test_set = split_train_test_by_id(housing_with_id, 0.2,'id')
# Random split with scikit-learn's built-in train_test_split().
from sklearn.model_selection import train_test_split
train_set, test_set = train_test_split(housing, test_size=0.2, random_state=42)

# Stratified sampling: bucket median_income into categories, capping at 5.
housing["income_cat"] = np.ceil(housing["median_income"] / 1.5)
# Assign the result back rather than calling where(..., inplace=True) on a
# column selection: under pandas Copy-on-Write the in-place call only touches
# a temporary object and leaves the DataFrame unchanged.
housing["income_cat"] = housing["income_cat"].where(housing["income_cat"] < 5, 5.0)

# Stratified split on income_cat with scikit-learn's StratifiedShuffleSplit.
from sklearn.model_selection import StratifiedShuffleSplit
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
for train_index, test_index in split.split(housing, housing["income_cat"]):
    strat_train_set = housing.loc[train_index]
    strat_test_set = housing.loc[test_index]
# Drop the helper income_cat column so both splits look like the original data.
for data in (strat_train_set, strat_test_set):
    data.drop(['income_cat'], axis=1, inplace=True)

# 3. Visualise the data to look for patterns.
# Colour encodes median_house_value through the 'jet' colormap
# (blue = low values, red = high values); marker size encodes population.
housing.plot(kind='scatter', x='longitude', y='latitude', alpha=0.4,
             s=housing['population'] / 100, label='population',
             c='median_house_value', cmap=plt.get_cmap('jet'), colorbar=True)
plt.legend()

# Look for correlations.
from pandas.plotting import scatter_matrix
attributes = ["median_house_value", "median_income", "total_rooms", "housing_median_age"]
scatter_matrix(housing[attributes], figsize=(12, 8))

# Attribute combinations.
housing["rooms_per_household"] = housing['total_rooms'] / housing['households']
housing["bedrooms_per_room"] = housing["total_bedrooms"] / housing['total_rooms']
housing["population_per_household"] = housing["population"] / housing["households"]
# ocean_proximity holds text; pandas >= 2.0 raises if it reaches corr(),
# so compute the correlations on the numeric columns only.
corr_matrix = housing.select_dtypes(include=[np.number]).corr()
corr_matrix["median_house_value"].sort_values(ascending=False)

# Separate the predictors from the labels so that later transformations
# are applied to the predictors only.
housing = strat_train_set.drop("median_house_value", axis=1)
housing_labels = strat_train_set["median_house_value"].copy()

# total_bedrooms has missing values. Ways to handle them:
#   1. drop the rows that have a missing value
#   2. drop the whole total_bedrooms attribute
#   3. fill the missing values with some value (0, mean, median, ...)
#housing.dropna(subset=['total_bedrooms'])  # option 1
#housing.drop('total_bedrooms', axis=1)  # option 2
#median = housing['total_bedrooms'].median()
#housing['total_bedrooms'].fillna(median)  # option 3
# With option 3, the same median value must also be used to fill the
# missing values in the test set.


# Use scikit-learn's imputer to fill in the missing values.
try:
    from sklearn.impute import SimpleImputer  # Scikit-Learn 0.20+
except ImportError:
    from sklearn.preprocessing import Imputer as SimpleImputer

# Create an imputer instance that uses the median strategy.
imputer = SimpleImputer(strategy='median')
# The median only exists for numbers, so drop the non-numerical attribute.
housing_num = housing.drop("ocean_proximity", axis=1)
# Fit the imputer instance to the training data.
imputer.fit(housing_num)
# Replace the missing values with the learned medians.
X = imputer.transform(housing_num)
# Convert the NumPy array back into a pandas DataFrame.
housing_tr = pd.DataFrame(X, columns=housing_num.columns, index=housing.index)

# Handling text and categorical data.
# Use scikit-learn's LabelEncoder to turn the text categories into numbers.


from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()
housing_cat = housing["ocean_proximity"]
housing_cat_encoded = encoder.fit_transform(housing_cat)
print(housing_cat_encoded)

# scikit-learn's OneHotEncoder turns the categorical codes into one-hot vectors.
from sklearn.preprocessing import OneHotEncoder
encoder = OneHotEncoder()
housing_cat_1hot = encoder.fit_transform(housing_cat_encoded.reshape(-1, 1))
housing_cat_1hot
# <16512x5 sparse matrix of type ''
#     with 16512 stored elements in Compressed Sparse Row format>

# Convert the sparse matrix into a dense array.
housing_cat_1hot.toarray()

# Custom transformers.
# scikit-learn's FunctionTransformer class can build a transformer from a
# plain transformation function.
from sklearn.preprocessing import LabelBinarizer
from sklearn.base import BaseEstimator, TransformerMixin


class MyLabelBinarizer(TransformerMixin):
    """Wrap LabelBinarizer behind a ``fit(x, y)`` / ``transform(x, y)`` interface."""

    def __init__(self, *args, **kwargs):
        # All arguments are forwarded to the wrapped LabelBinarizer.
        self.encoder = LabelBinarizer(*args, **kwargs)

    def fit(self, x, y=0):
        """Fit the wrapped encoder on ``x``; ``y`` is ignored."""
        self.encoder.fit(x)
        return self

    def transform(self, x, y=0):
        """Return ``x`` transformed by the wrapped encoder; ``y`` is ignored."""
        return self.encoder.transform(x)


encoder = MyLabelBinarizer()
housing_cat_1hot = encoder.fit_transform(housing_cat)
housing_cat_1hot

# Column positions used by CombinedAttributesAdder.transform().
rooms_ix, bedrooms_ix, population_ix, household_ix = 3, 4, 5, 6


class CombinedAttributesAdder(BaseEstimator, TransformerMixin):
    """Append ratio attributes computed from the columns of a 2-D array.

    Always appends rooms-per-household and population-per-household, and
    additionally bedrooms-per-room when ``add_bedrooms_per_room`` is True.
    """

    def __init__(self, add_bedrooms_per_room=True):  # no *args or **kargs
        self.add_bedrooms_per_room = add_bedrooms_per_room

    def fit(self, X, y=None):
        """Return ``self`` unchanged; nothing is learned from the data."""
        return self  # nothing else to do

    def transform(self, X, y=None):
        """Return ``X`` with the ratio columns appended on the right."""
        rooms_per_household = X[:, rooms_ix] / X[:, household_ix]
        population_per_household = X[:, population_ix] / X[:, household_ix]
        if self.add_bedrooms_per_room:
            bedrooms_per_room = X[:, bedrooms_ix] / X[:, rooms_ix]
            return np.c_[X, rooms_per_household, population_per_household,
                         bedrooms_per_room]
        else:
            return np.c_[X, rooms_per_household, population_per_household]


attr_adder = CombinedAttributesAdder(add_bedrooms_per_room=False)
housing_extra_attribs = attr_adder.transform(housing.values)

# Feature scaling.
# Transformation pipelines.
# scikit-learn's Pipeline class chains transformations so that they are
# executed in order.


from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
num_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy="median")),
    ('attribs_adder', CombinedAttributesAdder()),
    ('std_scaler', StandardScaler()),
])
housing_num_tr = num_pipeline.fit_transform(housing_num)

# Transformer pipeline that also handles the categorical column.
try:
    from sklearn.compose import ColumnTransformer
except ImportError:
    from future_encoders import ColumnTransformer

num_attribs = list(housing_num)
cat_attribs = ['ocean_proximity']
full_pipeline = ColumnTransformer([
    ('num', num_pipeline, num_attribs),
    ('cat', OneHotEncoder(), cat_attribs),
])
housing_prepared = full_pipeline.fit_transform(housing)

# Train and validate on the training set.
from sklearn.linear_model import LinearRegression
lin_reg = LinearRegression()
lin_reg.fit(housing_prepared, housing_labels)

# Check the predictions on a few training instances.
some_data = housing.iloc[:5]
some_labels = housing_labels.iloc[:5]
some_data_prepared = full_pipeline.transform(some_data)  # data transformation
print('Predictions:\t\t', lin_reg.predict(some_data_prepared))

# Compute the RMSE with scikit-learn's mean_squared_error function.
from sklearn.metrics import mean_squared_error
housing_predictions = lin_reg.predict(housing_prepared)
lin_mse = mean_squared_error(housing_labels, housing_predictions)
lin_rmse = np.sqrt(lin_mse)

# Compute the MAE with scikit-learn's mean_absolute_error function.


from sklearn.metrics import mean_absolute_error
lin_mae = mean_absolute_error(housing_labels, housing_predictions)

# Fit the data with a decision tree.


from sklearn.tree import DecisionTreeRegressor
tree_reg = DecisionTreeRegressor()
tree_reg.fit(housing_prepared, housing_labels)
housing_predictions = tree_reg.predict(housing_prepared)
tree_mse = mean_squared_error(housing_labels, housing_predictions)
tree_rmse = np.sqrt(tree_mse)

# 6. Fine-tune the model.
# Compute cross-validation scores. The scorer returns negated MSE values,
# hence the sign flip before taking the square root.
from sklearn.model_selection import cross_val_score
scores = cross_val_score(tree_reg, housing_prepared, housing_labels,
                         scoring="neg_mean_squared_error", cv=10)
tree_rmse_scores = np.sqrt(-scores)


def display_scores(scores):
    """Print the given scores together with their mean and standard deviation."""
    print("Scores:", scores)
    print("Mean:", scores.mean())
    print("Standard deviation:", scores.std())


# Cross-validation scores of the linear regression model.
lin_scores = cross_val_score(lin_reg, housing_prepared, housing_labels,
                             scoring="neg_mean_squared_error", cv=10)
lin_rmse_scores = np.sqrt(-lin_scores)
display_scores(lin_rmse_scores)

# Predict with a random forest and compute its cross-validation scores.


from sklearn.ensemble import RandomForestRegressor
forest_reg = RandomForestRegressor(n_estimators=10, random_state=42)
forest_reg.fit(housing_prepared, housing_labels)

# Calculate the mean squared error of the random forest regressor.
housing_predictions = forest_reg.predict(housing_prepared)
forest_mse = mean_squared_error(housing_labels, housing_predictions)
forest_rmse = np.sqrt(forest_mse)
forest_scores = cross_val_score(forest_reg, housing_prepared, housing_labels,
                                scoring='neg_mean_squared_error', cv=10)
forest_rmse_scores = np.sqrt(-forest_scores)
#print(display_scores(forest_rmse_scores))

# Fit a support vector regressor with a linear kernel and compute its RMSE.


from sklearn.svm import SVR
svm_reg = SVR(kernel='linear')
svm_reg.fit(housing_prepared, housing_labels)
housing_predictions = svm_reg.predict(housing_prepared)
svm_mse = mean_squared_error(housing_labels, housing_predictions)
svm_rmse = np.sqrt(svm_mse)

# Use scikit-learn's GridSearchCV to help choose the hyperparameters.
from sklearn.model_selection import GridSearchCV
param_grid = [
    {'n_estimators': [3, 10, 30], 'max_features': [2, 4, 6, 8]},
    {'bootstrap': [False], 'n_estimators': [3, 10], 'max_features': [2, 3, 4]},
]
forest_reg = RandomForestRegressor()
grid_search = GridSearchCV(forest_reg, param_grid, cv=5,
                           scoring='neg_mean_squared_error')
grid_search.fit(housing_prepared, housing_labels)
grid_search.best_params_
#grid_search.best_estimator_)

# Look at the score of each hyperparameter combination tested during the
# grid search.
cvres = grid_search.cv_results_
for mean_score, params in zip(cvres['mean_test_score'], cvres['params']):
    print(np.sqrt(-mean_score), params)

# Show the results as a DataFrame.
pd.DataFrame(grid_search.cv_results_)

# Use a randomized search to choose the hyperparameters.


from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint
param_distribs = {
    'n_estimators': randint(low=1, high=200),
    'max_features': randint(low=1, high=8),
}
forest_reg = RandomForestRegressor(random_state=42)
rnd_search = RandomizedSearchCV(forest_reg, param_distributions=param_distribs,
                                n_iter=10, cv=5,
                                scoring='neg_mean_squared_error',
                                random_state=42)
rnd_search.fit(housing_prepared, housing_labels)

# Show the relative importance of each attribute for making accurate predictions.


feature_importances = grid_search.best_estimator_.feature_importances_
print(feature_importances)
# Names for the columns produced by full_pipeline: the numeric attributes,
# the ratio attributes added by CombinedAttributesAdder, then the one-hot
# categories.
extra_attribs = ['rooms_per_hhold', 'pop_per_hhold', 'bedrooms_per_room']
cat_encoder = full_pipeline.named_transformers_['cat']
cat_one_hot_attribs = list(cat_encoder.categories_[0])
attributes = num_attribs + extra_attribs + cat_one_hot_attribs
sorted(zip(feature_importances, attributes), reverse=True)

# Evaluate the system on the test set.
final_model = grid_search.best_estimator_
X_test = strat_test_set.drop('median_house_value', axis=1)
y_test = strat_test_set['median_house_value'].copy()
# transform() only: the pipeline was already fitted on the training set.
X_test_prepared = full_pipeline.transform(X_test)
final_predictions = final_model.predict(X_test_prepared)
final_mse = mean_squared_error(y_test, final_predictions)
final_rmse = np.sqrt(final_mse)
print(final_rmse)

# Compute a 95% confidence interval for the test-set RMSE.


# We can compute a 95% confidence interval for the test RMSE.
from scipy import stats
confidence = 0.95
squared_errors = (final_predictions - y_test) ** 2
mean = squared_errors.mean()
m = len(squared_errors)
np.sqrt(stats.t.interval(confidence, m - 1,
                         loc=np.mean(squared_errors),
                         scale=stats.sem(squared_errors)))

# We could also compute the interval manually like this.
tscore = stats.t.ppf((1 + confidence) / 2, df=m - 1)
tmargin = tscore * squared_errors.std(ddof=1) / np.sqrt(m)
np.sqrt(mean - tmargin), np.sqrt(mean + tmargin)

# Alternatively, we could use z-scores rather than t-scores.
zscore = stats.norm.ppf((1 + confidence) / 2)
zmargin = zscore * squared_errors.std(ddof=1) / np.sqrt(m)
np.sqrt(mean - zmargin), np.sqrt(mean + zmargin)

# A single pipeline that combines data preparation and prediction.
full_pipeline_with_predictor = Pipeline([
    ('preparation', full_pipeline),
    ('linear', LinearRegression())
])
full_pipeline_with_predictor.fit(housing, housing_labels)
full_pipeline_with_predictor.predict(some_data)

# Save the model with joblib.
my_model = full_pipeline_with_predictor
# sklearn.externals.joblib was removed in scikit-learn 0.23; prefer the
# standalone joblib package and keep the old path only as a fallback.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib
joblib.dump(my_model, 'my_model.pkl')  # save model
my_model_loaded = joblib.load('my_model.pkl')  # load model

欢迎分享,转载请注明来源:内存溢出

原文地址: http://outofmemory.cn/langs/568490.html

(0)
打赏 微信扫一扫 微信扫一扫 支付宝扫一扫 支付宝扫一扫
上一篇 2022-04-09
下一篇 2022-04-09

发表评论

登录后才能评论

评论列表(0条)

保存