# Import the required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestRegressor as RFR
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler, OrdinalEncoder
from sklearn.impute import SimpleImputer
from sklearn.feature_selection import VarianceThreshold, SelectKBest
from sklearn.feature_selection import mutual_info_regression as MIC
import xgboost as xgb
import lightgbm as lgb
from sklearn.metrics import mean_squared_error, mean_absolute_error
# Read the data files (space-separated)
train = pd.read_csv('used_car_train_20200313.csv', sep=' ')
test = pd.read_csv('used_car_testB_20200421.csv', sep=' ')
print('train:{}'.format(train.shape))
print('test:{}'.format(test.shape))
train:(150000, 31)
test:(50000, 30)
# Inspect the training data
train.info()
RangeIndex: 150000 entries, 0 to 149999
Data columns (total 31 columns):
 #   Column             Non-Null Count   Dtype
---  ------             --------------   -----
 0   SaleID             150000 non-null  int64
 1   name               150000 non-null  int64
 2   regDate            150000 non-null  int64
 3   model              149999 non-null  float64
 4   brand              150000 non-null  int64
 5   bodyType           145494 non-null  float64
 6   fuelType           141320 non-null  float64
 7   gearbox            144019 non-null  float64
 8   power              150000 non-null  int64
 9   kilometer          150000 non-null  float64
 10  notRepairedDamage  150000 non-null  object
 11  regionCode         150000 non-null  int64
 12  seller             150000 non-null  int64
 13  offerType          150000 non-null  int64
 14  creatDate          150000 non-null  int64
 15  price              150000 non-null  int64
 16  v_0                150000 non-null  float64
 17  v_1                150000 non-null  float64
 18  v_2                150000 non-null  float64
 19  v_3                150000 non-null  float64
 20  v_4                150000 non-null  float64
 21  v_5                150000 non-null  float64
 22  v_6                150000 non-null  float64
 23  v_7                150000 non-null  float64
 24  v_8                150000 non-null  float64
 25  v_9                150000 non-null  float64
 26  v_10               150000 non-null  float64
 27  v_11               150000 non-null  float64
 28  v_12               150000 non-null  float64
 29  v_13               150000 non-null  float64
 30  v_14               150000 non-null  float64
dtypes: float64(20), int64(10), object(1)
memory usage: 35.5+ MB
# Preview the first five rows
train.head()
5 rows × 31 columns
# Inspect the test data
test.info()
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 30 columns):
 #   Column             Non-Null Count  Dtype
---  ------             --------------  -----
 0   SaleID             50000 non-null  int64
 1   name               50000 non-null  int64
 2   regDate            50000 non-null  int64
 3   model              50000 non-null  float64
 4   brand              50000 non-null  int64
 5   bodyType           48496 non-null  float64
 6   fuelType           47076 non-null  float64
 7   gearbox            48032 non-null  float64
 8   power              50000 non-null  int64
 9   kilometer          50000 non-null  float64
 10  notRepairedDamage  50000 non-null  object
 11  regionCode         50000 non-null  int64
 12  seller             50000 non-null  int64
 13  offerType          50000 non-null  int64
 14  creatDate          50000 non-null  int64
 15  v_0                50000 non-null  float64
 16  v_1                50000 non-null  float64
 17  v_2                50000 non-null  float64
 18  v_3                50000 non-null  float64
 19  v_4                50000 non-null  float64
 20  v_5                50000 non-null  float64
 21  v_6                50000 non-null  float64
 22  v_7                50000 non-null  float64
 23  v_8                50000 non-null  float64
 24  v_9                50000 non-null  float64
 25  v_10               50000 non-null  float64
 26  v_11               50000 non-null  float64
 27  v_12               50000 non-null  float64
 28  v_13               50000 non-null  float64
 29  v_14               50000 non-null  float64
dtypes: float64(20), int64(9), object(1)
memory usage: 11.4+ MB
train['notRepairedDamage'].value_counts()
0.0    111361
-       24324
1.0     14315
Name: notRepairedDamage, dtype: int64
# Replace '-' with NaN and convert notRepairedDamage to a numeric dtype
train['notRepairedDamage'] = train['notRepairedDamage'].replace('-', np.nan).astype('float')
train['notRepairedDamage'].value_counts()
0.0    111361
1.0     14315
Name: notRepairedDamage, dtype: int64
test['notRepairedDamage'] = test['notRepairedDamage'].replace('-', np.nan).astype('float')
test['notRepairedDamage'].value_counts()
0.0    37224
1.0     4707
Name: notRepairedDamage, dtype: int64
# Inspect the features with missing values; they are all categorical
print(train['model'].value_counts())
print(train['bodyType'].value_counts())
print(train['fuelType'].value_counts())
print(train['gearbox'].value_counts())
0.0      11762
19.0      9573
4.0       8445
1.0       6038
29.0      5186
         ...
240.0        2
209.0        2
245.0        2
242.0        2
247.0        1
Name: model, Length: 248, dtype: int64
0.0    41420
1.0    35272
2.0    30324
3.0    13491
4.0     9609
5.0     7607
6.0     6482
7.0     1289
Name: bodyType, dtype: int64
0.0    91656
1.0    46991
2.0     2212
3.0      262
4.0      118
5.0       45
6.0       36
Name: fuelType, dtype: int64
0.0    111623
1.0     32396
Name: gearbox, dtype: int64
# Collect the columns that contain missing values
col_train_null = train.columns[train.isnull().any()].to_list()
col_test_null = test.columns[test.isnull().any()].to_list()
print(col_train_null)
print(col_test_null)
['model', 'bodyType', 'fuelType', 'gearbox', 'notRepairedDamage']
['bodyType', 'fuelType', 'gearbox', 'notRepairedDamage']
# Impute missing values with SimpleImputer
# Fit on the training set only and reuse the same statistics on the test set to avoid leakage
imp = SimpleImputer(strategy='most_frequent')
train[col_train_null] = imp.fit_transform(train[col_train_null])
test[col_train_null] = imp.transform(test[col_train_null])
# Verify that no missing values remain
train.isnull().any().sum()
0
test.isnull().any().sum()
0
# Check the date column for invalid values (note the month '00' entries below)
train['regDate'].astype('str').str[4:6].value_counts()
03    14949
06    13809
04    12798
05    12614
07    11937
10    11490
00    11347
11    10687
12    10637
09    10522
01     9943
08     9936
02     9331
Name: regDate, dtype: int64
# Helper that maps a month of '00' to January so the string parses as a date
def tran_date(x):
    month = int(x[4:6])
    if month == 0:
        month = 1
    return x[0:4] + '-' + str(month) + '-' + x[6:]
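A quick sanity check of the helper on a hypothetical zero-month stamp:

# '19910003' has month '00'; the helper rewrites it into a parseable string
print(tran_date('19910003'))  # -> '1991-1-03', which pd.to_datetime parses as 1991-01-03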
# Convert the date columns to datetime
train['regDate'] = pd.to_datetime(train['regDate'].astype('str').apply(tran_date))
test['regDate'] = pd.to_datetime(test['regDate'].astype('str').apply(tran_date))
train['creatDate'] = pd.to_datetime(train['creatDate'].astype('str').apply(tran_date))
test['creatDate'] = pd.to_datetime(test['creatDate'].astype('str').apply(tran_date))
# Density plot of the price column
# (sns.distplot is deprecated since seaborn 0.11; sns.histplot(train['price'], kde=True) is the modern equivalent)
sns.distplot(train['price'])
# Take the log to correct the right skew
sns.distplot(np.log(train['price']))
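The log transform is only visualized here and never applied to the target downstream. If you want the models to actually fit the less-skewed scale, a minimal sketch using scikit-learn's TransformedTargetRegressor (an addition of mine, not part of the original pipeline):

from sklearn.compose import TransformedTargetRegressor

# Fits on log1p(price) and inverts with expm1, so predictions come back on the price scale
log_model = TransformedTargetRegressor(
    regressor=lgb.LGBMRegressor(random_state=42),
    func=np.log1p,
    inverse_func=np.expm1,
)
# log_model.fit(X_train, y_train); log_model.predict(X_test) returns prices, not logs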
# Look at the price distribution with extra percentiles
train['price'].describe([0.01, 0.25, 0.5, 0.75, 0.99])
count    150000.000000
mean       5923.327333
std        7501.998477
min          11.000000
1%          150.000000
25%        1300.000000
50%        3250.000000
75%        7700.000000
99%       34950.000000
max       99999.000000
Name: price, dtype: float64
# Relationship between registration year and mean price
train.resample('Y', on='regDate')['price'].mean().to_period('Y').plot(kind='bar')
# Create date-based features
train['diff_day'] = (train['creatDate'] - train['regDate']).dt.days
train['diff_year'] = round(train['diff_day'] / 365, 1)
train['regDate_year'] = train['regDate'].dt.year
train['regDate_month'] = train['regDate'].dt.month
train['regDate_day'] = train['regDate'].dt.day
train['creatDate_year'] = train['creatDate'].dt.year
train['creatDate_month'] = train['creatDate'].dt.month
train['creatDate_day'] = train['creatDate'].dt.day
test['diff_day'] = (test['creatDate'] - test['regDate']).dt.days
test['diff_year'] = round(test['diff_day'] / 365, 1)
test['regDate_year'] = test['regDate'].dt.year
test['regDate_month'] = test['regDate'].dt.month
test['regDate_day'] = test['regDate'].dt.day
test['creatDate_year'] = test['creatDate'].dt.year
test['creatDate_month'] = test['creatDate'].dt.month
test['creatDate_day'] = test['creatDate'].dt.day
# Count the distinct values of the name feature
train['name'].unique().shape
(99662,)
# Descriptive statistics for power
train['power'].describe()
count    150000.000000
mean        119.316547
std         177.168419
min           0.000000
25%          75.000000
50%         110.000000
75%         150.000000
max       19312.000000
Name: power, dtype: float64
# Descriptive statistics for model
train['model'].describe()
count    150000.000000
mean         47.128707
std          49.536024
min           0.000000
25%          10.000000
50%          30.000000
75%          66.000000
max         247.000000
Name: model, dtype: float64
# Bin the power feature into 20-unit bins up to 600
# pd.cut bins are left-open, so power == 0 and values above 600 become NaN
# and are routed into the catch-all bin 31
bins = [i * 20 for i in range(0, 31)]
train['power_bin'] = pd.cut(train['power'], bins, labels=False).fillna(31)
test['power_bin'] = pd.cut(test['power'], bins, labels=False).fillna(31)
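A quick check of the binning behavior on hand-picked values (my demo, not from the original):

# 0 and 19312 fall outside the bins, become NaN, and land in the catch-all bin 31
demo = pd.cut(pd.Series([0, 10, 75, 600, 19312]), bins, labels=False).fillna(31)
print(demo.tolist())  # [31.0, 0.0, 3.0, 29.0, 31.0]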
# Bin the model feature
bin_model = [i * 10 for i in range(0, 26)]
train['model_bin'] = pd.cut(train['model'], bin_model, labels=False)
test['model_bin'] = pd.cut(test['model'], bin_model, labels=False)
train['model_bin'].value_counts()
0.0     25963
1.0     21123
2.0     18095
4.0     14872
3.0     11069
6.0      8748
7.0      5193
5.0      4629
8.0      3879
10.0     3818
11.0     3376
9.0      2550
12.0     2417
16.0     2096
15.0     1993
17.0     1699
13.0     1623
14.0     1162
18.0      988
19.0      860
21.0      771
20.0      640
22.0      473
23.0      171
24.0       29
Name: model_bin, dtype: int64
# Candidate categorical features
col_clf = ['brand', 'bodyType', 'fuelType', 'gearbox', 'kilometer', 'notRepairedDamage', 'seller', 'offerType']
train[col_clf]
150000 rows × 8 columns
# Compare the categorical distributions of train (yellow) and test (blue)
plt.figure(figsize=(18, 10))
for i in range(len(col_clf)):
    plt.subplot(2, 4, i + 1)
    train[col_clf[i]].value_counts().plot(kind='bar', color='yellow')
    test[col_clf[i]].value_counts().plot(kind='bar', color='blue')
    plt.title(col_clf[i])
plt.tight_layout()
# Two features turn out to have essentially a single category, so drop seller and offerType
train = train.drop(['seller', 'offerType'], axis=1)
test = test.drop(['seller', 'offerType'], axis=1)
col_clf = ['brand', 'bodyType', 'fuelType', 'gearbox', 'kilometer', 'notRepairedDamage']
# Mean price per level of each categorical feature
plt.figure(figsize=(18, 10))
for i in range(len(col_clf)):
    plt.subplot(2, 3, i + 1)
    train.groupby(col_clf[i])['price'].mean().plot(kind='bar')
    plt.title(col_clf[i])
plt.tight_layout()
# Drop the ID column
train = train.drop(['SaleID'], axis=1)
test = test.drop(['SaleID'], axis=1)
# Heatmap of the pairwise feature correlations
# (numeric_only=True, available since pandas 1.5, skips the datetime columns)
plt.figure(figsize=(10, 10))
corr = train.corr(numeric_only=True)
sns.heatmap(corr)
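To read the heatmap numerically, a small helper of mine that lists the features most correlated with the target:

# Ten largest absolute correlations with price
print(corr['price'].drop('price').abs().sort_values(ascending=False).head(10))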
# Replace name with its frequency count
# transform broadcasts each name's count back onto its rows, keeping row alignment
train['name_count'] = train.groupby('name')['name'].transform('count')
test['name_count'] = test.groupby('name')['name'].transform('count')
# Drop the raw name column
train = train.drop('name', axis=1)
test = test.drop('name', axis=1)
# Combine each categorical feature with price statistics to create new features
col_clf = ['brand', 'model', 'kilometer', 'fuelType', 'bodyType']
for col in col_clf:
    train_gb = train.groupby(col)
    all_info = {}
    for kind, kind_data in train_gb:
        info = {}
        info[col + '_amount'] = len(kind_data)
        info[col + '_price_max'] = kind_data.price.max()
        info[col + '_price_median'] = kind_data.price.median()
        info[col + '_price_min'] = kind_data.price.min()
        info[col + '_price_sum'] = kind_data.price.sum()
        info[col + '_price_std'] = kind_data.price.std()
        info[col + '_price_average'] = round(kind_data.price.sum() / (len(kind_data) + 1), 2)
        all_info[kind] = info
    fe = pd.DataFrame(all_info).T.reset_index().rename(columns={'index': col})
    train = train.merge(fe, how='left', on=col)
    test = test.merge(fe, how='left', on=col)
print(train.shape)
print(test.shape)
(150000, 73)
(50000, 72)
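The Python-level loop over groups is slow on 150,000 rows. An equivalent one-pass sketch with named aggregation (run instead of, not in addition to, the loop above; the smoothed sum/(n+1) average is reproduced from the stats):

for col in ['brand', 'model', 'kilometer', 'fuelType', 'bodyType']:
    stats = train.groupby(col)['price'].agg(
        **{col + '_amount': 'count',
           col + '_price_max': 'max',
           col + '_price_median': 'median',
           col + '_price_min': 'min',
           col + '_price_sum': 'sum',
           col + '_price_std': 'std'}
    ).reset_index()
    stats[col + '_price_average'] = round(stats[col + '_price_sum'] / (stats[col + '_amount'] + 1), 2)
    train = train.merge(stats, how='left', on=col)
    test = test.merge(stats, how='left', on=col)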
# Combine kilometer and power into new group-statistic features
# (named aggregation; passing a dict of new names to .agg was removed in pandas 1.0)
col_kp = ['kilometer', 'power']
prefix = col_kp[0] + '_' + col_kp[1]
t1 = train.groupby(col_kp[0], as_index=False).agg(
    **{prefix + '_count': (col_kp[1], 'count'),
       prefix + '_max': (col_kp[1], 'max'),
       prefix + '_median': (col_kp[1], 'median'),
       prefix + '_min': (col_kp[1], 'min'),
       prefix + '_sum': (col_kp[1], 'sum'),
       prefix + '_std': (col_kp[1], 'std'),
       prefix + '_mean': (col_kp[1], 'mean')}
)
train = train.merge(t1, how='left', on=col_kp[0])
test = test.merge(t1, how='left', on=col_kp[0])
print(train.shape)
print(test.shape)
(150000, 80)
(50000, 79)
# Cross the anonymous features most correlated with price to generate new features
col_v = [0, 3, 8, 12]
for i in col_v:
    for j in col_v:
        train[str(i) + '*' + str(j)] = train['v_' + str(i)] * train['v_' + str(j)]
        test[str(i) + '*' + str(j)] = test['v_' + str(i)] * test['v_' + str(j)]
for i in col_v:
    for j in col_v:
        train[str(i) + '+' + str(j)] = train['v_' + str(i)] + train['v_' + str(j)]
        test[str(i) + '+' + str(j)] = test['v_' + str(i)] + test['v_' + str(j)]
for i in col_v:
    for j in col_v:
        train[str(i) + '-' + str(j)] = train['v_' + str(i)] - train['v_' + str(j)]
        test[str(i) + '-' + str(j)] = test['v_' + str(i)] - test['v_' + str(j)]
for i in col_v:
    train[str(i) + '*diff_year'] = train['v_' + str(i)] * train['diff_year']
    test[str(i) + '*diff_year'] = test['v_' + str(i)] * test['diff_year']
print(train.shape)
print(test.shape)
(150000, 132)
(50000, 131)
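Note that the product and sum loops generate each unordered pair twice (e.g. '0*3' and '3*0' hold identical values) and the difference loop produces all-zero 'i-i' columns. A leaner sketch with itertools, if you only want distinct products:

from itertools import combinations_with_replacement

# One column per unordered pair, squares included, no duplicated i*j / j*i columns
for i, j in combinations_with_replacement(col_v, 2):
    train[str(i) + '*' + str(j)] = train['v_' + str(i)] * train['v_' + str(j)]
    test[str(i) + '*' + str(j)] = test['v_' + str(i)] * test['v_' + str(j)]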
# Deep copies of the engineered frames
train_new = train.copy(deep=True)
test_new = test.copy(deep=True)
# Drop columns that will not be used for modelling
X_train = train_new.drop(['price', 'regDate', 'creatDate', 'regionCode'], axis=1)
X_test = test_new.drop(['regDate', 'creatDate', 'regionCode'], axis=1)
y_train = train_new['price']
X_train.to_csv('X_train.csv')
X_test.to_csv('X_test.csv')
y_train.to_csv('y_train.csv')
X_train = pd.read_csv('X_train.csv', index_col=0)
X_test = pd.read_csv('X_test.csv', index_col=0)
y_train = pd.read_csv('y_train.csv', index_col=0)
y_train = np.ravel(y_train)
from sklearn.model_selection import GridSearchCV
from lightgbm.sklearn import LGBMRegressor
from time import time
import datetime
# Tune n_estimators for LightGBM
lgbm_scores = []
time0 = time()
for i in np.arange(200, 2001, 100):
    reg_lgbm = LGBMRegressor(learning_rate=0.1, n_estimators=i, objective='regression_l1', random_state=42)
    lgbm_score = cross_val_score(reg_lgbm, X_train, y_train, cv=3, scoring='neg_mean_absolute_error').mean()
    lgbm_scores.append(lgbm_score)
    print(time() - time0, lgbm_score)
print(max(lgbm_scores))
print(np.arange(200, 2001, 100)[np.argmax(lgbm_scores)])
plt.figure(figsize=(8, 6))
plt.plot(np.arange(200, 2001, 100), lgbm_scores)
8.246474504470825 -594.8424586944835
19.095320463180542 -568.7034240842332
32.98492646217346 -552.3891983733455
48.85633111000061 -542.3964373457885
67.14593052864075 -535.7482170534481
87.76364278793335 -529.8703107609151
111.7964539527893 -525.7724715224499
137.1214382648468 -522.2536456032711
164.56334352493286 -519.7242720183268
194.57858514785767 -517.0013123928143
226.66670727729797 -515.3243156345435
261.38421607017517 -513.8374464322388
298.36878204345703 -512.2929007437376
337.6414248943329 -511.0992392114774
379.26119804382324 -510.01377926737086
423.065954208374 -508.88194129037237
468.8004615306854 -507.9469756382571
516.7109439373016 -506.9312361319216
567.1637334823608 -505.91707028368864
-505.91707028368864
2000
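The score is still improving at 2000 trees, so the scan stops at the grid boundary rather than at an optimum. An alternative sketch that lets LightGBM pick the tree count itself via early stopping (assumes lightgbm >= 3.3 for the callback API):

from lightgbm import early_stopping

# Hold out a validation fold and stop when it hasn't improved for 100 rounds
X_tr, X_val, y_tr, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)
reg = LGBMRegressor(learning_rate=0.1, n_estimators=10000, objective='regression_l1', random_state=42)
reg.fit(X_tr, y_tr, eval_set=[(X_val, y_val)], callbacks=[early_stopping(stopping_rounds=100)])
print(reg.best_iteration_)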
# Tune max_depth and num_leaves for LightGBM
parameters = {
    'max_depth': [4, 5, 6, 7],
    'num_leaves': np.arange(5, 100, 5)
}
reg_lgbm = LGBMRegressor(learning_rate=0.1, n_estimators=2000, objective='regression_l1', random_state=42)
gs = GridSearchCV(reg_lgbm, param_grid=parameters, cv=3, scoring='neg_mean_absolute_error', n_jobs=-1)
gs_model = gs.fit(X_train, y_train)
print('Best score: {}'.format(gs_model.best_score_))
print('Best parameters: {}'.format(gs_model.best_params_))
print('Best estimator: {}'.format(gs_model.best_estimator_))
Best score: -500.9400071901773
Best parameters: {'max_depth': 7, 'num_leaves': 45}
Best estimator: LGBMRegressor(max_depth=7, n_estimators=2000, num_leaves=45, objective='regression_l1', random_state=42)
# Tune learning_rate for XGBoost
xgb_scores = []
time0 = time()
for i in np.arange(0.05, 0.31, 0.05):
    reg_xgb = xgb.XGBRegressor(n_estimators=200, learning_rate=i)
    xgb_score = cross_val_score(reg_xgb, X_train, y_train, cv=3, scoring='neg_mean_absolute_error').mean()
    xgb_scores.append(xgb_score)
    print(time() - time0)
print(max(xgb_scores))
print(np.arange(0.05, 0.31, 0.05)[np.argmax(xgb_scores)])
plt.figure(figsize=(8, 6))
plt.plot(np.arange(0.05, 0.31, 0.05), xgb_scores)
132.54270577430725
268.7969219684601
402.8747355937958
543.0700986385345
673.9736497402191
807.1916081905365
-559.3930964745617
0.15000000000000002
# Tune max_depth for XGBoost
xgb_scores = []
time0 = time()
for i in np.arange(5, 12, 1):
    reg_xgb = xgb.XGBRegressor(n_estimators=200, learning_rate=0.15, max_depth=i)
    xgb_score = cross_val_score(reg_xgb, X_train, y_train, cv=3, scoring='neg_mean_absolute_error').mean()
    xgb_scores.append(xgb_score)
    print(time() - time0, xgb_score)
print(max(xgb_scores))
print(np.arange(5, 12, 1)[np.argmax(xgb_scores)])
plt.figure(figsize=(8, 6))
plt.plot(np.arange(5, 12, 1), xgb_scores)
106.13332343101501 -581.7825425249935
233.66106414794922 -559.3930964745617
386.93016719818115 -545.8423732084185
577.3439819812775 -540.0337358052888
789.385425567627 -535.3663493749481
1027.0641367435455 -537.2171228026253
1293.055543422699 -540.2481429747703
-535.3663493749481
9
# Tune colsample_bytree for XGBoost
xgb_scores = []
time0 = time()
for i in np.arange(0.4, 0.8, 0.1):
    reg_xgb = xgb.XGBRegressor(n_estimators=200, learning_rate=0.15, max_depth=9, colsample_bytree=i)
    xgb_score = cross_val_score(reg_xgb, X_train, y_train, cv=3, scoring='neg_mean_absolute_error').mean()
    xgb_scores.append(xgb_score)
    print(time() - time0, xgb_score)
print(max(xgb_scores))
print(np.arange(0.4, 0.8, 0.1)[np.argmax(xgb_scores)])
plt.figure(figsize=(8, 6))
plt.plot(np.arange(0.4, 0.8, 0.1), xgb_scores)
101.24388527870178 -540.0945292119669
219.8485279083252 -536.1881194847441
358.25814485549927 -534.7100199133007
509.0920376777649 -534.7369636623599
-534.7100199133007
0.6
# Tune colsample_bylevel for XGBoost
# Note: np.arange(0.5, 1.1, 0.1) overshoots to 1.0999... as its last value;
# XGBoost rejects colsample_bylevel > 1, so that run scores nan (see the output below)
xgb_scores = []
time0 = time()
for i in np.arange(0.5, 1.1, 0.1):
    reg_xgb = xgb.XGBRegressor(n_estimators=200, learning_rate=0.15, max_depth=9, colsample_bytree=0.6, colsample_bylevel=i)
    xgb_score = cross_val_score(reg_xgb, X_train, y_train, cv=3, scoring='neg_mean_absolute_error').mean()
    xgb_scores.append(xgb_score)
    print(time() - time0, xgb_score)
print(max(xgb_scores))
print(np.arange(0.5, 1.1, 0.1)[np.argmax(xgb_scores)])
plt.figure(figsize=(8, 6))
plt.plot(np.arange(0.5, 1.1, 0.1), xgb_scores)
82.85519623756409 -534.1242236725466
176.23594546318054 -535.4707890065283
279.6718213558197 -534.5832091972042
391.7265202999115 -533.988677477093
518.4175012111664 -533.3711266578522
656.7450432777405 -534.7100199133007
657.538763999939 nan
-533.3711266578522
1.0999999999999999
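Because the nan entry wins np.argmax, the "best" value printed above (1.0999...) is bogus; the real optimum is 0.9, which is what the stacking step below uses. A nan-safe selection sketch:

grid = np.arange(0.5, 1.1, 0.1)
scores = np.array(xgb_scores)
best = np.nanargmax(scores)            # ignores the nan entry
print(np.nanmax(scores), grid[best])   # -> -533.3711266578522 0.9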
# Import scikit-learn's built-in stacking ensemble
from sklearn.ensemble import StackingRegressor
# Instantiate the tuned base models
reg_lgbm = LGBMRegressor(max_depth=7, n_estimators=2000, num_leaves=45, objective='regression_l1', random_state=42)
reg_xgb = xgb.XGBRegressor(n_estimators=200, learning_rate=0.15, max_depth=9, colsample_bytree=0.6, colsample_bylevel=0.9)
# Ensemble the two models by stacking
estimators = [('lgbm', reg_lgbm), ('xgb', reg_xgb)]
sr = StackingRegressor(estimators, verbose=True)
sr_scores = cross_val_score(sr, X_train, y_train, cv=3, scoring='neg_mean_absolute_error')
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 5 out of 5 | elapsed: 1.2min finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 5 out of 5 | elapsed: 2.9min finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 5 out of 5 | elapsed: 1.2min finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 5 out of 5 | elapsed: 2.8min finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 5 out of 5 | elapsed: 1.3min finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 5 out of 5 | elapsed: 2.8min finished
# Cross-validated scores of the stacked model
sr_scores
array([-491.53751775, -494.7186037 , -486.69418657])
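StackingRegressor trains a RidgeCV meta-learner on the base models' out-of-fold predictions by default; swapping in an explicit final_estimator is a one-line change (a variation of mine, not something tried here):

# e.g. a gradient-boosted meta-learner instead of the default RidgeCV
sr = StackingRegressor(estimators, final_estimator=LGBMRegressor(random_state=42), n_jobs=-1)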
# Fit on the full training set and export the predictions
sr.fit(X_train, y_train)
sr_predict = sr.predict(X_test)
pd.DataFrame(sr_predict).to_csv('stack_submit.csv')
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 5 out of 5 | elapsed: 1.8min finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 5 out of 5 | elapsed: 4.1min finished
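The export above writes a bare positional index next to the predictions. If the leaderboard expects SaleID/price columns (an assumption about the submission format, since SaleID was dropped earlier), pair each prediction with its SaleID from the raw test file:

# Hypothetical submission layout: SaleID alongside the predicted price
sale_id = pd.read_csv('used_car_testB_20200421.csv', sep=' ')['SaleID']
submit = pd.DataFrame({'SaleID': sale_id, 'price': sr_predict})
submit.to_csv('stack_submit.csv', index=False)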