This blog series is based on my notes from Dr. Huang Haiguang's machine learning course at Wenzhou University. If you want to study the course in more depth, please go to Dr. Huang's GitHub or the "机器学习初学者" (Machine Learning Beginners) WeChat public account; the course is also available on the Chinese MOOC platform. It covers machine learning, deep learning and Python programming, including matplotlib, numpy, pandas, sklearn and more, and the materials there are very detailed, so please go there for systematic study. This blog is only my own notes: it is not very detailed and may even contain a few mistakes!
Theory reference: logistic regression.
Dataset download for the exercises.
1. Exercise 1: Build a logistic regression model to predict whether a student will be admitted to university
"""
任务:
构建一个逻辑回归模型,预测某个学生是否被大学录取;
叙述:
大学录取学生相关部分管理者,通过申请学生两次测试的评分,决定他们是否被录取;
现拥有之前申请学生的可以用于训练逻辑回归模型的训练样本集;对于每个训练样本,
有他们两次测试的评分和最后是否被录取的结果;
"""
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
data_path = "ex2data1.txt"
data = pd.read_csv(data_path, header = None, names = ["Exam 1", "Exam 2", "Admitted"])
data.head(10)
#print("数据形状:", data.shape)
# 数据内容:
Exam 1 Exam 2 Admitted
0 34.623660 78.024693 0
1 30.286711 43.894998 0
2 35.847409 72.902198 0
3 60.182599 86.308552 1
4 79.032736 75.344376 1
5 45.083277 56.316372 0
6 61.106665 96.511426 1
7 75.024746 46.554014 1
8 76.098787 87.420570 1
9 84.432820 43.533393 1
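As a quick optional check (a one-line sketch, not part of the original exercise), the class balance can be inspected before plotting:
print(data["Admitted"].value_counts())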
# Visualize the raw data
positive = data[data["Admitted"].isin([1])]
negative = data[data["Admitted"].isin([0])]
fig, ax = plt.subplots(figsize = (12, 8))
ax.scatter(positive["Exam 1"],
positive["Exam 2"],
s = 50,
c = "b",
marker = "o",
label = "Admitted")
ax.scatter(negative["Exam 1"],
negative["Exam 2"],
s = 50,
c = "r",
marker = "x",
label = "Not Admitted")
ax.legend()
ax.set_xlabel("Exam 1 Score")
ax.set_ylabel("Exam 2 Score")
plt.show()
- Sigmoid function: $g(z)=\frac{1}{1+e^{-z}}$
- Hypothesis: $h(x)=\frac{1}{1+e^{-\omega^{T}x}}$
- Cost function: $J(\omega)=-\frac{1}{m}\sum_{i=1}^{m}\left[y^{(i)}\log\left(h(x^{(i)})\right)+\left(1-y^{(i)}\right)\log\left(1-h(x^{(i)})\right)\right]$
def sigmoid(z):
    return 1 / (1 + np.exp(-z))
nums = np.arange(-6, 6)
fig, ax = plt.subplots(figsize = (12, 8))
ax.plot(nums, sigmoid(nums), "b")
plt.show()
def cost(w, X, y):
    w = np.matrix(w)
    X = np.matrix(X)
    y = np.matrix(y)
    first = np.multiply(-y, np.log(sigmoid(X * w.T)))
    second = np.multiply((1 - y), np.log(1 - sigmoid(X * w.T)))
    return np.sum(first - second) / len(X)
#data.insert(0, "Ones", 1)
cols = data.shape[1]
X = data.iloc[:, 0:cols - 1]
y = data.iloc[:, cols - 1:cols]
X = np.array(X.values)
y = np.array(y.values)
w = np.zeros(3)
print("X.shape:", X.shape)
print("y.shape:", y.shape)
print("w.shape:", w.shape)
# Compute the cost with the initial (all-zero) parameters
print("Initial cost:", cost(w, X, y))
# Output:
# X.shape: (100, 3)
# y.shape: (100, 1)
# w.shape: (3,)
# Initial cost: 0.6931471805599453
# (with w = 0 the hypothesis is 0.5 for every sample, so the cost equals ln 2 ≈ 0.693)
- Batch gradient descent
- The gradient, in a form ready for vectorized computation:
  $\frac{\partial J(\omega)}{\partial \omega_{j}}=\frac{1}{m}\sum_{i=1}^{m}\left(h(x^{(i)})-y^{(i)}\right)x_{j}^{(i)}$
def gradient(w, X, y):
    w = np.matrix(w)
    X = np.matrix(X)
    y = np.matrix(y)
    parameters = int(w.ravel().shape[1])
    grad = np.zeros(parameters)
    error = sigmoid(X * w.T) - y
    for i in range(parameters):
        term = np.multiply(error, X[:, i])
        grad[i] = np.sum(term) / len(X)
    return grad
print("gradient:", gradient(w, X, y))
# Use the TNC optimizer from SciPy to find the optimal parameters
import scipy.optimize as opt
result = opt.fmin_tnc(func = cost, x0 = w, fprime = gradient, args = (X, y))
print("result:", result)
print("cost:", cost(result[0], X, y))
# Use the learned parameters w to make predictions on X and score the classifier's
# training accuracy: predict y = 1 if the hypothesis is >= 0.5, otherwise y = 0.
def predict(w, X):
    probability = sigmoid(X * w.T)
    return [1 if x >= 0.5 else 0 for x in probability]
w_min = np.matrix(result[0])
predictions = predict(w_min, X)
correct = [
    1 if ((a == 1 and b == 1) or (a == 0 and b == 0)) else 0
    for (a, b) in zip(predictions, y)
]
accuracy = 100 * sum(correct) // len(correct)   # percentage of training examples classified correctly
print("accuracy = {0}%".format(accuracy))
# Output:
# gradient: [ -0.1 -12.00921659 -11.26284221]
# result: (array([-25.16131872, 0.20623159, 0.20147149]), 36, 0)
# cost: 0.20349770158947425
# accuracy = 89%
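Because the hypothesis is linear in the two exam scores, the decision boundary h(x) = 0.5 is the straight line w0 + w1*x1 + w2*x2 = 0. A short sketch (assuming the optimization above has been run, so result[0] holds the learned parameters) that overlays this line on the scatter plot:
w_opt = result[0]
x1_line = np.linspace(30, 100, 100)
# solve w0 + w1*x1 + w2*x2 = 0 for x2
x2_line = -(w_opt[0] + w_opt[1] * x1_line) / w_opt[2]
fig, ax = plt.subplots(figsize=(12, 8))
ax.scatter(positive["Exam 1"], positive["Exam 2"], s=50, c="b", marker="o", label="Admitted")
ax.scatter(negative["Exam 1"], negative["Exam 2"], s=50, c="r", marker="x", label="Not Admitted")
ax.plot(x1_line, x2_line, "k", label="Decision boundary")
ax.legend()
ax.set_xlabel("Exam 1 Score")
ax.set_ylabel("Exam 2 Score")
plt.show()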
2. Exercise 1: Complete Code
"""
任务:
构建一个逻辑回归模型,预测某个学生是否被大学录取;
叙述:
大学录取学生相关部分管理者,通过申请学生两次测试的评分,决定他们是否被录取;
现拥有之前申请学生的可以用于训练逻辑回归模型的训练样本集;对于每个训练样本,
有他们两次测试的评分和最后是否被录取的结果;
"""
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import scipy.optimize as opt
data_path = "ex2data1.txt"
data = pd.read_csv(data_path, header = None, names = ["Exam 1", "Exam 2", "Admitted"])
positive = data[data["Admitted"].isin([1])]
negative = data[data["Admitted"].isin([0])]
fig, ax = plt.subplots(figsize = (12, 8))
ax.scatter(positive["Exam 1"],
positive["Exam 2"],
s = 50,
c = "b",
marker = "o",
label = "Admitted")
ax.scatter(negative["Exam 1"],
negative["Exam 2"],
s = 50,
c = "r",
marker = "x",
label = "Not Admitted")
ax.legend()
ax.set_xlabel("Exam 1 Score")
ax.set_ylabel("Exam 2 Score")
plt.show()
def sigmoid(z):
    return 1 / (1 + np.exp(-z))

def cost(w, X, y):
    w = np.matrix(w)
    X = np.matrix(X)
    y = np.matrix(y)
    first = np.multiply(-y, np.log(sigmoid(X * w.T)))
    second = np.multiply((1 - y), np.log(1 - sigmoid(X * w.T)))
    return np.sum(first - second) / len(X)

def gradient(w, X, y):
    w = np.matrix(w)
    X = np.matrix(X)
    y = np.matrix(y)
    parameters = int(w.ravel().shape[1])
    grad = np.zeros(parameters)
    error = sigmoid(X * w.T) - y
    for i in range(parameters):
        term = np.multiply(error, X[:, i])
        grad[i] = np.sum(term) / len(X)
    return grad

# Use the learned parameters w to make predictions on X and score the classifier's
# training accuracy: predict y = 1 if the hypothesis is >= 0.5, otherwise y = 0.
def predict(w, X):
    probability = sigmoid(X * w.T)
    return [1 if x >= 0.5 else 0 for x in probability]
data.insert(0, "Ones", 1)
cols = data.shape[1]
# iloc[rows, cols]: the first slice selects rows, the second selects columns
X = data.iloc[:, 0:cols - 1]
y = data.iloc[:, cols - 1:cols]
X = np.array(X.values)
y = np.array(y.values)
w = np.zeros(3)
# Use the TNC optimizer from SciPy to find the optimal parameters
result = opt.fmin_tnc(func = cost, x0 = w, fprime = gradient, args = (X, y))
print("gradient:", gradient(w, X, y))
print("result:", result)
print("cost:", cost(result[0], X, y))
w_min = np.matrix(result[0])
predictions = predict(w_min, X)
correct = [
    1 if ((a == 1 and b == 1) or (a == 0 and b == 0)) else 0
    for (a, b) in zip(predictions, y)
]
accuracy = 100 * sum(correct) // len(correct)   # percentage of training examples classified correctly
print("accuracy = {0}%".format(accuracy))
3. Exercise 2: Regularized Logistic Regression
"""
需求:
训练一个测试模型,测试芯片是否合格,从而决定芯片要被接收或抛弃;
背景:
现你是某工厂生产主管,现有一些芯片在两次测试中的测试结果;
构建一个逻辑回归模型,决定这些芯片是被接收还是被抛弃;
"""
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
data_path2 = "ex2data2.txt"
data2 = pd.read_csv(data_path2, header = None, names = ["Test 1","Test 2", "Accepted"])
data2.head(10)
# isin(): takes a list and returns whether each element of the column is in that list
positive = data2[data2["Accepted"].isin([1])]
negative = data2[data2["Accepted"].isin([0])]
#print(positive)
#print(negative)
fig, ax = plt.subplots(figsize = (12, 8))
ax.scatter(positive["Test 1"],
positive["Test 2"],
s = 50,
c = "b",
marker = "o",
label = "Accepted")
ax.scatter(negative["Test 1"],
negative["Test 2"],
s = 50,
c = "g",
marker = "x",
label = "Not Accepted")
ax.legend()
ax.set_xlabel("Test 1 Score", fontsize = 16)
ax.set_ylabel("Test 2 Score", fontsize = 16)
plt.show()
degree = 5
x1 = data2["Test 1"]
x2 = data2["Test 2"]
data2.insert(3, "Ones", 1)
for i in range(1, degree):
    for j in range(0, i):
        data2["F" + str(i) + str(j)] = np.power(x1, i - j) * np.power(x2, j)
data2.drop("Test 1", axis = 1, inplace = True)
data2.drop("Test 2", axis = 1, inplace = True)
data2.head(10)
# Data after feature mapping (output of data2.head(10)):
Accepted Ones F10 F20 F21 F30 F31 F32 F40 F41 F42 F43
0 1 1 0.051267 0.002628 0.035864 0.000135 0.001839 0.025089 6.907989e-06 0.000094 0.001286 0.017551
1 1 1 -0.092742 0.008601 -0.063523 -0.000798 0.005891 -0.043509 7.397855e-05 -0.000546 0.004035 -0.029801
2 1 1 -0.213710 0.045672 -0.147941 -0.009761 0.031616 -0.102412 2.085928e-03 -0.006757 0.021886 -0.070895
3 1 1 -0.375000 0.140625 -0.188321 -0.052734 0.070620 -0.094573 1.977539e-02 -0.026483 0.035465 -0.047494
4 1 1 -0.513250 0.263426 -0.238990 -0.135203 0.122661 -0.111283 6.939303e-02 -0.062956 0.057116 -0.051818
5 1 1 -0.524770 0.275384 -0.110097 -0.144513 0.057775 -0.023098 7.583610e-02 -0.030319 0.012121 -0.004846
6 1 1 -0.398040 0.158436 -0.013675 -0.063064 0.005443 -0.000470 2.510192e-02 -0.002167 0.000187 -0.000016
7 1 1 -0.305880 0.093563 0.058805 -0.028619 -0.017987 -0.011305 8.753955e-03 0.005502 0.003458 0.002173
8 1 1 0.016705 0.000279 -0.006753 0.000005 -0.000113 0.002730 7.787282e-08 -0.000002 0.000046 -0.001103
9 1 1 0.131910 0.017400 -0.067787 0.002295 -0.008942 0.034835 3.027686e-04 -0.001180 0.004595 -0.017901
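The feature-mapping loop above builds the polynomial terms x1^(i-j) * x2^j by hand. Purely as a sketch of an alternative (not used in the rest of this post), scikit-learn's PolynomialFeatures can generate a full polynomial feature map from the x1 and x2 columns saved earlier, although its exact set of terms differs slightly from the manual one:
from sklearn.preprocessing import PolynomialFeatures
poly = PolynomialFeatures(degree=4, include_bias=True)
# x1 and x2 were saved above, before the raw test-score columns were dropped
X_poly = poly.fit_transform(np.column_stack([x1, x2]))
print(X_poly.shape)   # 15 columns: every term x1^a * x2^b with a + b <= 4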
- Regularized cost function:
  $J(w)=\frac{1}{m}\sum_{i=1}^{m}\left[-y^{(i)}\log\left(h(x^{(i)})\right)-\left(1-y^{(i)}\right)\log\left(1-h(x^{(i)})\right)\right]+\frac{\lambda}{2m}\sum_{j=1}^{n}w_{j}^{2}$
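The regularized gradient that gradientReg below implements follows directly from this cost; note that the bias weight $w_0$ is not regularized:
- $\frac{\partial J(w)}{\partial w_{0}}=\frac{1}{m}\sum_{i=1}^{m}\left(h(x^{(i)})-y^{(i)}\right)x_{0}^{(i)}$
- $\frac{\partial J(w)}{\partial w_{j}}=\frac{1}{m}\sum_{i=1}^{m}\left(h(x^{(i)})-y^{(i)}\right)x_{j}^{(i)}+\frac{\lambda}{m}w_{j}\qquad (j\ge 1)$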
# Regularized cost function (the learningRate argument is the regularization parameter λ)
def costReg(w, X, y, learningRate):
    w = np.matrix(w)
    X = np.matrix(X)
    y = np.matrix(y)
    first = np.multiply(-y, np.log(sigmoid(X * w.T)))
    second = np.multiply((1 - y), np.log(1 - sigmoid(X * w.T)))
    # the bias weight w[0] is not regularized
    reg = (learningRate / (2 * len(X))) * np.sum(np.power(w[:, 1:w.shape[1]], 2))
    return np.sum(first - second) / len(X) + reg

# Regularized gradient, used by the optimizer
def gradientReg(w, X, y, learningRate):
    w = np.matrix(w)
    X = np.matrix(X)
    y = np.matrix(y)
    parameters = int(w.ravel().shape[1])
    grad = np.zeros(parameters)
    error = sigmoid(X * w.T) - y
    for i in range(parameters):
        term = np.multiply(error, X[:, i])
        if i == 0:
            grad[i] = np.sum(term) / len(X)
        else:
            grad[i] = (np.sum(term) / len(X)) + ((learningRate / len(X)) * w[0, i])
    return grad
# Initialize variables
cols = data2.shape[1]
X2 = data2.iloc[:, 1:cols]
y2 = data2.iloc[:, 0:1]
X2 = np.array(X2.values)
y2 = np.array(y2.values)
w2 = np.zeros(11)
#learningRate = 1
learningRate = 0.001   # the regularization parameter λ; a larger value penalizes large weights more
print("costReg:", costReg(w2, X2, y2, learningRate))
print("gradientReg:", gradientReg(w2, X2, y2, learningRate))
import scipy.optimize as opt
result2 = opt.fmin_tnc(func = costReg, x0 = w2, fprime = gradientReg, args = (X2, y2, learningRate))
print("result2:", result2)
w_min = np.matrix(result2[0])
predictions = predict(w_min, X2)
correct = [1 if ((a == 1 and b == 1) or (a == 0 and b == 0)) else 0 for (a, b) in zip(predictions, y2)]
accuracy = 100 * sum(correct) // len(correct)   # percentage of training examples classified correctly
print("accuracy = {0}%".format(accuracy))
# Output:
# costReg: 0.6931471805599454
# gradientReg: [0.00847458 0.01878809 0.05034464 0.01150133 0.01835599 0.00732393
#  0.00819244 0.03934862 0.00223924 0.01286005 0.00309594]
# result2: (array([  0.46301664,  -1.75159965,  13.85742167,  -5.5806181 ,
#         12.26646198,  13.79968808,   4.41621766, -32.40960469,
#         -8.171413  , -37.77925962,   1.52172455]), 89, 1)
# accuracy = 77%
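To see what the learned nonlinear boundary looks like, the hypothesis can be evaluated on a grid of raw test scores and the 0.5 contour drawn over the scatter plot. A sketch (assuming the walkthrough above has run in the same session, so sigmoid, w_min, positive and negative are all available; the helper map_features is mine and reproduces the column order of X2):
def map_features(u, v, degree=5):
    # Ones column first, then F(i)(j) = u^(i-j) * v^j, matching the columns of X2
    feats = [np.ones_like(u)]
    for i in range(1, degree):
        for j in range(0, i):
            feats.append(np.power(u, i - j) * np.power(v, j))
    return np.stack(feats, axis=-1)

u, v = np.meshgrid(np.linspace(-1, 1.2, 200), np.linspace(-1, 1.2, 200))
probs = sigmoid(map_features(u, v) @ np.asarray(w_min).ravel())
fig, ax = plt.subplots(figsize=(12, 8))
ax.scatter(positive["Test 1"], positive["Test 2"], s=50, c="b", marker="o", label="Accepted")
ax.scatter(negative["Test 1"], negative["Test 2"], s=50, c="g", marker="x", label="Not Accepted")
ax.contour(u, v, probs, levels=[0.5], colors="k")
ax.legend()
ax.set_xlabel("Test 1 Score")
ax.set_ylabel("Test 2 Score")
plt.show()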
4. Exercise 2: Complete Code
"""
需求:
训练一个测试模型,测试芯片是否合格,从而决定芯片要被接收或抛弃;
背景:
现你是某工厂生产主管,现有一些芯片在两次测试中的测试结果;
构建一个逻辑回归模型,决定这些芯片是被接收还是被抛弃;
"""
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import scipy.optimize as opt
data_path2 = "ex2data2.txt"
data2 = pd.read_csv(data_path2, header = None, names = ["Test 1","Test 2", "Accepted"])
data2.head(10)
# isin(): takes a list and returns whether each element of the column is in that list
positive = data2[data2["Accepted"].isin([1])]
negative = data2[data2["Accepted"].isin([0])]
fig, ax = plt.subplots(figsize = (12, 8))
ax.scatter(positive["Test 1"],
positive["Test 2"],
s = 50,
c = "b",
marker = "o",
label = "Accepted")
ax.scatter(negative["Test 1"],
negative["Test 2"],
s = 50,
c = "g",
marker = "x",
label = "Not Accepted")
ax.legend()
ax.set_xlabel("Test 1 Score", fontsize = 16)
ax.set_ylabel("Test 2 Score", fontsize = 16)
plt.show()
degree = 5
x1 = data2["Test 1"]
x2 = data2["Test 2"]
data2.insert(3, "Ones", 1)
for i in range(1, degree):
    for j in range(0, i):
        data2["F" + str(i) + str(j)] = np.power(x1, i - j) * np.power(x2, j)
# drop(): removes the specified rows or columns from a DataFrame;
# the first argument is the row/column label to drop, e.g. "Test 1";
# axis: 1 drops a column, 0 drops a row;
# inplace: True modifies the original DataFrame in place, False returns a new copy;
data2.drop("Test 1", axis = 1, inplace = True)
data2.drop("Test 2", axis = 1, inplace = True)
def sigmoid(z):
    return 1 / (1 + np.exp(-z))

# Regularized cost function (the learningRate argument is the regularization parameter λ)
def costReg(w, X, y, learningRate):
    w = np.matrix(w)
    X = np.matrix(X)
    y = np.matrix(y)
    first = np.multiply(-y, np.log(sigmoid(X * w.T)))
    second = np.multiply((1 - y), np.log(1 - sigmoid(X * w.T)))
    # the bias weight w[0] is not regularized
    reg = (learningRate / (2 * len(X))) * np.sum(np.power(w[:, 1:w.shape[1]], 2))
    return np.sum(first - second) / len(X) + reg

# Regularized gradient, used by the optimizer
def gradientReg(w, X, y, learningRate):
    w = np.matrix(w)
    X = np.matrix(X)
    y = np.matrix(y)
    parameters = int(w.ravel().shape[1])
    grad = np.zeros(parameters)
    error = sigmoid(X * w.T) - y
    for i in range(parameters):
        term = np.multiply(error, X[:, i])
        if i == 0:
            grad[i] = np.sum(term) / len(X)
        else:
            grad[i] = (np.sum(term) / len(X)) + ((learningRate / len(X)) * w[0, i])
    return grad

# Use the learned parameters w to make predictions on X and score the classifier's
# training accuracy: predict y = 1 if the hypothesis is >= 0.5, otherwise y = 0.
def predict(w, X):
    probability = sigmoid(X * w.T)
    return [1 if x >= 0.5 else 0 for x in probability]
# Initialize variables
cols = data2.shape[1]
X2 = data2.iloc[:, 1:cols]
y2 = data2.iloc[:, 0:1]
X2 = np.array(X2.values)
y2 = np.array(y2.values)
w2 = np.zeros(11)
#learningRate = 1
learningRate = 0.001   # the regularization parameter λ; a larger value penalizes large weights more
print("costReg:", costReg(w2, X2, y2, learningRate))
print("gradientReg:", gradientReg(w2, X2, y2, learningRate))
result2 = opt.fmin_tnc(func = costReg, x0 = w2, fprime = gradientReg, args = (X2, y2, learningRate))
print("result2:", result2)
w_min = np.matrix(result2[0])
predictions = predict(w_min, X2)
correct = [1 if ((a == 1 and b == 1) or (a == 0 and b == 0)) else 0 for (a, b) in zip(predictions, y2)]
accuracy = 100 * sum(correct) // len(correct)   # percentage of training examples classified correctly
print("accuracy = {0}%".format(accuracy))
5. Building the Model with scikit-learn
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import linear_model
data_path = "ex2data1.txt"
data = pd.read_csv(data_path, header = None, names = ["Exam 1", "Exam 2", "Admitted"])
positive = data[data["Admitted"].isin([1])]
negative = data[data["Admitted"].isin([0])]
fig, ax = plt.subplots(figsize = (12, 8))
ax.scatter(positive["Exam 1"],
positive["Exam 2"],
s = 50,
c = "b",
marker = "o",
label = "Admitted")
ax.scatter(negative["Exam 1"],
negative["Exam 2"],
s = 50,
c = "r",
marker = "x",
label = "Not Admitted")
ax.legend()
ax.set_xlabel("Exam 1 Score")
ax.set_ylabel("Exam 2 Score")
plt.show()
data.insert(0, "Ones", 1)
cols = data.shape[1]
# iloc[rows, cols]: the first slice selects rows, the second selects columns
X = data.iloc[:, 0:cols - 1]
y = data.iloc[:, cols - 1:cols]
X = np.array(X.values)
y = np.array(y.values)
w = np.zeros(3)
model = linear_model.LogisticRegression(penalty = "l2", C = 1.0)
model.fit(X, y.ravel())
print(model.score(X, y.ravel()))   # mean accuracy on the training set
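Note that LogisticRegression fits its own intercept by default (fit_intercept=True), so the manually inserted Ones column is not strictly necessary here; it simply becomes one more feature with its own learned weight. A small usage sketch (the two exam scores below are made-up values, not taken from the dataset):
# hypothetical applicant; the leading 1 matches the Ones column used when fitting
sample = np.array([[1.0, 45.0, 85.0]])
print(model.predict(sample))         # predicted class: 0 (not admitted) or 1 (admitted)
print(model.predict_proba(sample))   # [P(not admitted), P(admitted)]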