import pandas as pd import numpy as np import seaborn as sns import matplotlib.pyplot as plt from IPython.display import Image import os %matplotlib inline
notes:
- 使用 seaborn 进行图形化展示
# Matplotlib global configuration (these are matplotlib rcParams, not PIL).
plt.rcParams.update({
    # Use the SimHei font so Chinese axis labels / titles render correctly.
    'font.sans-serif': ['SimHei'],
    # Render the minus sign as ASCII '-' instead of a unicode glyph,
    # which the SimHei font cannot display.
    'axes.unicode_minus': False,
    # Default every figure to 10x6 inches.
    'figure.figsize': (10, 6),
})
# Load the Titanic training set from the sibling "datasets" directory.
train_csv_path = os.path.join(os.getcwd(), '..', 'datasets', 'train.csv')
data = pd.read_csv(train_csv_path)
# Peek at the first rows (displayed by the notebook).
data.head()
# Impute missing values before modeling.
# Categorical columns: Cabin gets an explicit 'NA' sentinel,
# Embarked gets 'S' (the most common port).
for column, fill_value in (('Cabin', 'NA'), ('Embarked', 'S')):
    data[column] = data[column].fillna(fill_value)
# Continuous column: fill Age with the column mean.
data['Age'] = data['Age'].fillna(data['Age'].mean())
# Verify that no nulls remain (displayed by the notebook).
data.isnull().sum().sort_values(ascending=False)
PassengerId 0 Survived 0 Pclass 0 Name 0 Sex 0 Age 0 SibSp 0 Parch 0 Ticket 0 Fare 0 Cabin 0 Embarked 0 dtype: int64
notes:
- Series.fillna('value')
使用此函数填补 NaN 值
# Select the model features and one-hot encode the categorical columns.
feature_columns = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']
data_feature_values = data[feature_columns]
print(data_feature_values.head())
# get_dummies expands the discrete columns (Sex, Embarked) into
# 0/1 indicator columns; numeric columns pass through unchanged.
data_feature_values = pd.get_dummies(data_feature_values)
print(data_feature_values.head())
Pclass Sex Age SibSp Parch Fare Embarked 0 3 male 22.0 1 0 7.2500 S 1 1 female 38.0 1 0 71.2833 C 2 3 female 26.0 0 0 7.9250 S 3 1 female 35.0 1 0 53.1000 S 4 3 male 35.0 0 0 8.0500 S Pclass Age SibSp Parch Fare Sex_female Sex_male Embarked_C 0 3 22.0 1 0 7.2500 0 1 0 1 1 38.0 1 0 71.2833 1 0 1 2 3 26.0 0 0 7.9250 1 0 0 3 1 35.0 1 0 53.1000 1 0 0 4 3 35.0 0 0 8.0500 0 1 0 Embarked_Q Embarked_S 0 0 1 1 0 0 2 0 1 3 0 1 4 0 1
notes:
- pd.get_dummies(df)
使用其可以将分类(离散型、非数值)的列转换为数值型的哑变量列
1.
from sklearn.model_selection import train_test_split

# Features and target label.
X = data_feature_values
y = data['Survived']

# Stratified split (default 75/25): stratify=y keeps the survived/died
# ratio identical in the train and test partitions.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, random_state=0
)

# Overall count of passengers, number of survivors, and survival rate.
print(y.count(), y.sum(), y.sum() / y.count())
print(X_train.shape, X_test.shape)
891 342 0.3838383838383838 0.250280583613917 (668, 10) (223, 10)
notes:
- 使用 train_test_split 函数对数据集进行分割
- 可以使用 train_size=所占比例 进行划分,也可以使用 test_size=所占比例 进行划分;stratify=y 表示按照 y 中标签的比例进行分层划分,即训练集和测试集中各标签的占比与整个标签列的占比相同
from sklearn.linear_model import LogisticRegression from sklearn.ensemble import RandomForestClassifier
# Inspect a few training rows and labels, then fit a logistic-regression
# baseline and report its accuracy on train and test partitions.
print('======== X_train, y_train ========')
print(X_train.head())
print('-------------')
print(y_train.head())
print('========= end ==========')

# Fit logistic regression.  With the default max_iter=100 the lbfgs solver
# did not converge on this unscaled data (it emitted a ConvergenceWarning
# asking to increase max_iter or scale the features), so give the solver
# more iterations.  Scaling the features first would also work.
lr = LogisticRegression(max_iter=1000)
lr.fit(X_train, y_train)

# score() returns the mean accuracy on the given data and labels.
print('Training set score: ', lr.score(X_train, y_train))
print('Testing set score: ', lr.score(X_test, y_test))
======== X_train, y_train ======== Pclass Age SibSp Parch Fare Sex_female Sex_male Embarked_C 671 1 31.0 1 0 52.000 0 1 0 417 2 18.0 0 2 13.000 1 0 0 634 3 9.0 3 2 27.900 1 0 0 323 2 22.0 1 1 29.000 1 0 0 379 3 19.0 0 0 7.775 0 1 0 Embarked_Q Embarked_S 671 0 1 417 0 1 634 0 1 323 0 1 379 0 1 ------------- 671 0 417 1 634 0 323 1 379 0 Name: Survived, dtype: int64 ========= end ========== Training set score: 0.8023952095808383 Testing set score: 0.7847533632286996 C:UsersWuDiXDesktoptemp12-winter-vacation12-01-dataWhaleEnv-datawhale-01libsite-packagessklearnlinear_model_logistic.py:814: ConvergenceWarning: lbfgs failed to converge (status=1): STOP: TOTAL NO. of ITERATIONS REACHED LIMIT. Increase the number of iterations (max_iter) or scale the data as shown in: https://scikit-learn.org/stable/modules/preprocessing.html Please also refer to the documentation for alternative solver options: https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression n_iter_i = _check_optimize_result(
- lr.score 返回的是平均准确率(accuracy),即预测正确的样本所占比例;决定系数 R² 是回归模型 score 的返回值,不适用于分类器
lr.fit()
- 第一个参数是特征 DataFrame,第二个参数是标签 Series
# Hard 0/1 survival predictions for the held-out passengers.
pred = lr.predict(X_test)
# Peek at the first ten predicted labels (displayed by the notebook).
pred[:10]
array([0, 0, 0, 0, 0, 0, 1, 0, 0, 0], dtype=int64)
# Per-class probabilities for each test passenger:
# column 0 = P(Survived=0), column 1 = P(Survived=1).
pred_proba = lr.predict_proba(X_test)
print(X_test.head())
# First ten probability pairs (displayed by the notebook).
pred_proba[:10]
Pclass Age SibSp Parch Fare Sex_female Sex_male Embarked_C 288 2 42.0 0 0 13.0000 0 1 0 869 3 4.0 1 1 11.1333 0 1 0 182 3 9.0 4 2 31.3875 0 1 0 684 2 60.0 1 1 39.0000 0 1 0 599 1 49.0 1 0 56.9292 0 1 1 Embarked_Q Embarked_S 288 0 1 869 0 1 182 0 1 684 0 1 599 0 0 array([[0.84995574, 0.15004426], [0.84233073, 0.15766927], [0.94909621, 0.05090379], [0.94252973, 0.05747027], [0.70411225, 0.29588775], [0.50580607, 0.49419393], [0.40320661, 0.59679339], [0.733531 , 0.266469 ], [0.87814836, 0.12185164], [0.87546022, 0.12453978]])
# 10-fold cross-validation of a more weakly regularized model (C=100).
from sklearn.model_selection import cross_val_score

lr = LogisticRegression(C=100)
fold_scores = cross_val_score(lr, X_train, y_train, cv=10)
print(fold_scores)
[0.82089552 0.7761194 0.82089552 0.79104478 0.85074627 0.86567164 0.73134328 0.86567164 0.75757576 0.6969697 ]
欢迎分享,转载请注明来源:内存溢出
评论列表(0条)