原文在这里
"""Locally Weighted Linear Regression (LWLS) fitted by stochastic gradient descent.

Unlike the common closed-form solution via the normal equations (which
requires a matrix inverse), this implementation fits the locally weighted
parameters with SGD, so no matrix inversion is needed.  The trade-off is
hyper-parameter tuning (tau, alpha, max_n), which must be done empirically.

LWLS is inherently inefficient because the parameters are re-fitted from
scratch for every single test point, but it can give good results once the
bandwidth tau is tuned.
"""
import csv
import math

import numpy

# (HYPER-)PARAMETERS
tau = 0.1         # Gaussian kernel bandwidth for the local weights
alpha = 0.01      # SGD learning rate
max_n = 1000      # number of SGD passes over the training set
epsilon = 0.0001  # SGD tolerance (only consulted by converge(); early stop is disabled)


def converge(t):
    """Return True when every component of the update step `t` is within epsilon."""
    return all(abs(step) <= epsilon for step in t)


def stochastic_gradient_descent(w, theta):
    """Run weighted SGD over the global training set (X_s, Y_s).

    w     -- per-training-sample weights for the current query point
    theta -- initial parameter vector; updated in place and returned
    """
    for _ in range(max_n):
        for i, sample in enumerate(X_s):
            x = numpy.array(sample)
            residual = Y_s[i] - numpy.dot(numpy.array(theta), x)
            # Compute the whole step first, then apply it, so every component
            # uses the same (pre-update) theta.
            step = [alpha * w[i] * residual * x[j] for j in range(len(theta))]
            for j in range(len(theta)):
                theta[j] += step[j]
            # Early stopping is intentionally disabled; max_n bounds the cost.
            # if converge(step):
            #     return theta
    return theta


def get_data(name):
    """Read a CSV file and return a 2-D list of float samples.

    Each inner list is one sample: features followed by the label (y) in the
    last position.  No constant/bias term is included here.
    """
    with open(name, 'r') as csv_file:
        return [[float(value) for value in row] for row in csv.reader(csv_file)]


def arrange_data(data):
    """Split raw samples into feature vectors and labels.

    A constant 1 (bias term) is appended to every feature vector.
    Returns (Xs, Ys).
    """
    Xs = [row[:-1] + [1] for row in data]
    Ys = [row[-1] for row in data]
    return Xs, Ys


def weight(x_i, x):
    """Gaussian kernel weight between training features x_i and query features x."""
    diff = numpy.array(x_i) - numpy.array(x)
    return math.exp(-1.0 * numpy.dot(diff, diff) / (2 * tau * tau))


def get_weights(Xs, x):
    """Weights of query point x against the whole training set Xs.

    Return [1]*len(Xs) here instead to recover standard linear regression.
    """
    return [weight(x_i, x) for x_i in Xs]


def get_parameters(w, n):
    """Fit an n-dimensional theta for one query point via weighted SGD.

    This per-query SGD fit is what distinguishes this implementation from the
    usual normal-equation solution.
    """
    return stochastic_gradient_descent(w, [0] * n)


def get_prediction(w, x):
    """Predict y for query features x using its locally fitted parameters."""
    theta = get_parameters(w, len(x))
    return numpy.dot(numpy.array(theta), numpy.array(x))


if __name__ == "__main__":
    # NOTE(review): the companion generator script writes data_train.csv /
    # data_test.csv, not the hw_-prefixed names used here -- confirm paths.
    data_train = get_data('hw_data_train.csv')
    data_test = get_data('hw_data_test.csv')

    X_s, Y_s = arrange_data(data_train)
    Xts, Yts = arrange_data(data_test)

    variance = float(0)
    for x, y in zip(Xts, Yts):
        w = get_weights(X_s, x)
        prediction = get_prediction(w, x)
        print("Actual: " + str(y) + " Predicted: " + str(prediction))
        variance = variance + (prediction - y) ** 2
    variance = variance / len(Xts)
    print("Variance: ", variance)
# 这部分用来生成训练和测试数据 import numpy num_train = 100 num_test = 20 with open("data_train.csv",'a') as file: x = -5 for i in range(num_train): y = (x)**2 + 0*numpy.random.normal(0,1) file.write(str(x)+","+str(y)+"n") x = x + 10/num_train with open("data_test.csv",'a') as file: x = -5 for i in range(num_test): y = (x)**2 + 0*numpy.random.normal(0,1) file.write(str(x)+","+str(y)+"n") x = x + 10/num_test
欢迎分享,转载请注明来源:内存溢出
评论列表(0条)