The idea of this model: use t-SNE for dimensionality reduction, DBSCAN for noise removal, and GA-SVR for prediction.
It has some reference value, but because the denoising step removes points without filling the resulting gaps, the time series is interrupted; for actual time-series forecasting the model is not practical unless the gaps are filled in. Also, the model only emphasizes accuracy and ignores the temporal correlation between samples, so the original dataset gets shuffled.
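For reference, filling those gaps is not hard once the noisy rows are known; below is a minimal sketch with pandas, assuming a 10-minute datetime index and that the removed readings have been set to NaN (the series values here are made up for illustration):

```python
import numpy as np
import pandas as pd

# hypothetical 10-minute series with two removed (NaN) readings
idx = pd.date_range('2021-01-01', periods=6, freq='10min')
s = pd.Series([8.34, np.nan, 8.44, np.nan, 11.77, 11.80], index=idx)

# time-based linear interpolation restores a continuous series
filled = s.interpolate(method='time')
print(filled)
```

Interpolation like this restores the continuity of the series, at the cost of introducing estimated values.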
1. Load the data

```python
import csv
import time

import numpy as np
from sklearn import metrics
from sklearn.cluster import DBSCAN, KMeans
from sklearn.metrics import explained_variance_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

data = []
traffic_feature = []
traffic_target = []
csv_file = csv.reader(open('dongyinghaishui(10m)1.csv'))
for content in csv_file:
    content = list(map(float, content))
    if len(content) != 0:
        data.append(content)
        traffic_feature.append(content[0:4])   # first four columns are the features
        traffic_target.append(content[-1])     # last column is the prediction target
data = np.array(data)
traffic_feature = np.array(traffic_feature)
traffic_target = np.array(traffic_target)
data.shape
```
Data shape: (3977, 5)
2. Standardization

```python
scaler = StandardScaler()                             # create the scaler
scaler.fit(traffic_feature)                           # fit it on the feature matrix
traffic_feature = scaler.transform(traffic_feature)   # transform the dataset
feature_train, feature_test, target_train, target_test = train_test_split(
    traffic_feature, traffic_target, test_size=0.05, random_state=17)
```

3. Test a plain SVR
```python
from sklearn.svm import SVR
import matplotlib.pyplot as plt

start1 = time.time()
model_svr = SVR(C=1, epsilon=0.1, gamma=10)
model_svr.fit(feature_train, target_train)
predict_results1 = model_svr.predict(feature_test)
end1 = time.time()
SVRRESULT = predict_results1

plt.plot(target_test)        # ground truth
plt.plot(predict_results1)   # predictions
plt.legend(['True', 'SVR'])
fig = plt.gcf()
fig.set_size_inches(18.5, 10.5)
plt.title("SVR")
plt.show()
print("EVS:", explained_variance_score(target_test, predict_results1))
print("R2:", metrics.r2_score(target_test, predict_results1))
print("Time:", end1 - start1)
```
EVS: 0.4797244273713823
R2: 0.46414774637696155
Time: 0.529613733291626

4. Use PCA to determine the number of dimensions to keep
```python
from sklearn.decomposition import PCA

pca = PCA(n_components=4)
newX = pca.fit_transform(traffic_feature)
print(np.around(pca.explained_variance_ratio_, 3))

# bar chart of each component's contribution rate
x_data = ['1', '2', '3', '4']
y_data = np.around(pca.explained_variance_ratio_, 3)
plt.bar(x=x_data, height=y_data, color='steelblue', alpha=0.8)
fig = plt.gcf()
fig.set_size_inches(18.5, 10.5)
plt.title("Contribution rate of each principal component")
plt.xlabel("Principal Component")
plt.ylabel("Contribution rate of each principal component")
plt.show()
```
[0.467 0.251 0.181 0.101]
The first four components together account for more than 90% of the total variance, so reducing to four dimensions is sufficient.
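The cutoff can also be computed instead of eyeballed; a short sketch using the pca object fitted above (scikit-learn can do the same thing directly by passing a fraction, e.g. PCA(n_components=0.90)):

```python
# cumulative contribution rate of the principal components
cum = np.cumsum(pca.explained_variance_ratio_)   # [0.467, 0.718, 0.899, 1.0]

# smallest number of components whose cumulative ratio reaches 90%
k = int(np.searchsorted(cum, 0.90) + 1)
print("components needed for 90%:", k)           # 4 for this dataset
```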
5. Compress to three dimensions with t-SNE

```python
from sklearn.manifold import TSNE
from mpl_toolkits.mplot3d import Axes3D  # noqa: F401 (enables the 3D projection)

newData = TSNE(n_components=3, random_state=17).fit_transform(newX)

# 3D scatter of the three t-SNE components
fig = plt.figure()
fig.set_size_inches(18.5, 10.5)
ax = fig.add_subplot(111, projection='3d')
ax.scatter(newData[:, 0], newData[:, 1], newData[:, 2])
plt.title("PCA-TSNE")
plt.show()
```

6. Compare K-means and DBSCAN for noise detection
```python
# K-means
t0 = time.time()
kmeans = KMeans(init='k-means++', n_clusters=10, random_state=10).fit(newData)
t = time.time() - t0

fig = plt.figure()
fig.set_size_inches(18.5, 10.5)
ax = fig.add_subplot(111, projection='3d')
ax.scatter(newData[:, 0], newData[:, 1], newData[:, 2], c=kmeans.labels_)
plt.title('Kmeans time: %f' % t)
plt.show()
```
As the plot shows, K-means cannot reliably pick out the noise points.
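That is by design: K-means assigns every point to its nearest centroid and has no noise label at all. If you wanted outliers out of K-means anyway, one common workaround (not used in this model) is to threshold the distance to the assigned centroid; a sketch with the kmeans object fitted above:

```python
# distance of every point to the centre of its assigned cluster
dists = np.linalg.norm(newData - kmeans.cluster_centers_[kmeans.labels_], axis=1)

# flag, say, the farthest 2% of points as pseudo-noise
outlier_mask = dists > np.percentile(dists, 98)
print("flagged as outliers:", outlier_mask.sum())
```

This is only a heuristic: the threshold is arbitrary and, unlike DBSCAN, it still ignores local density.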
```python
# DBSCAN
t0 = time.time()
dbscan = DBSCAN(eps=0.1, min_samples=2).fit(newData)
t = time.time() - t0

fig = plt.figure()
fig.set_size_inches(18.5, 10.5)
ax = fig.add_subplot(111, projection='3d')
ax.scatter(newData[:, 0], newData[:, 1], newData[:, 2], c=dbscan.labels_)
plt.title('DBSCAN time: %f' % t)
plt.show()
```
There is quite a lot of isolated noise.
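Exactly how much can be read straight off the labels; a quick check:

```python
labels = dbscan.labels_
n_noise = int(np.sum(labels == -1))                          # DBSCAN marks noise as -1
n_clusters = len(set(labels)) - (1 if -1 in labels else 0)   # -1 is not a cluster
print("clusters: %d, noise points: %d of %d" % (n_clusters, n_noise, len(labels)))
```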
7. Denoise with DBSCAN

```python
# DBSCAN pass; note that, as written, this collects the points DBSCAN
# labels -1 (its noise label), keeping their row indices in C so the
# matching targets can be restored later
dbscan = DBSCAN(eps=0.1, min_samples=2).fit(newData)
YYY = []
XXX = []
C = []
for inx, i in enumerate(dbscan.labels_):
    if i == -1:
        YYY.append(i)
        XXX.append(newData[inx])
        C.append(inx)
XXX = np.array(XXX)
XXX = XXX.astype(np.float64)

fig = plt.figure()
fig.set_size_inches(18.5, 10.5)
ax = fig.add_subplot(111, projection='3d')
ax.scatter(XXX[:, 0], XXX[:, 1], XXX[:, 2], c=YYY)
plt.title('DBSCAN After noise reduction time: %f' % t)
plt.show()
```
The result after noise reduction:
8. Restore the data

```python
XXX  # inspect the retained feature matrix (notebook display)
CCC = []
DDD = []
a = 1
for inx, i in enumerate(traffic_target):
    for j in C:
        if inx == j:
            a = 1 + a
            CCC.append(i)          # target values for the retained rows
            DDD.append([a, i])     # (running index, target) pairs for plotting
CCC = np.array(CCC)
DDD = np.array(DDD)
DDD.shape

dbscan = DBSCAN(eps=1, min_samples=5).fit(DDD)
plt.scatter(DDD[:, 0], DDD[:, 1], c=dbscan.labels_)
plt.title('DBSCAN')
fig = plt.gcf()
fig.set_size_inches(18.5, 10.5)
plt.show()
```
The restored data:
9. SVR prediction on the restored data

```python
after_feature_train, after_feature_test, after_target_train, after_target_test = train_test_split(
    XXX, CCC, test_size=0.05, random_state=17)
start = time.time()
model_svr = SVR(C=1, epsilon=0.1, gamma=10)
model_svr.fit(XXX, CCC)   # note: fit on the full retained set, so the test slice is also seen in training
after_predict_result = model_svr.predict(after_feature_test)
end = time.time()
DBSCANSVRRESULT = after_predict_result

plt.plot(after_target_test)      # ground truth
plt.plot(after_predict_result)   # predictions
plt.title("DBSCAN-SVR")
plt.legend(['True', 'DBSCAN-SVR'])
fig = plt.gcf()
fig.set_size_inches(18.5, 10.5)
plt.show()
print("EVS:", explained_variance_score(after_target_test, after_predict_result))
print("R2:", metrics.r2_score(after_target_test, after_predict_result))
print("Time:", end - start)
```
The results:
EVS: 0.8250398436521921
R2: 0.8018421163998288
Time: 0.26456522941589355
A clear and substantial improvement, but still not good enough.
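The GA in the next step searches the three SVR hyperparameters (C, epsilon, gamma). For comparison, the same tuning could be done with scikit-learn's built-in grid search over a discrete grid; a minimal sketch (the grid values below are illustrative, not the ones used for the results in this post):

```python
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVR

param_grid = {
    'C': [0.1, 1, 5, 10],
    'epsilon': [0.01, 0.1, 0.2],
    'gamma': [1, 10, 50, 100],
}
gs = GridSearchCV(SVR(), param_grid, scoring='r2', cv=3)
gs.fit(XXX, CCC)   # the retained features and targets from step 8
print(gs.best_params_, gs.best_score_)
```

Grid search only visits the listed values, whereas the GA searches the bounded ranges continuously, which is presumably why the GA is preferred here.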
10. Introduce the genetic algorithm (GA)

```python
import random
import copy

XXX = XXX.tolist()
CCC = CCC.tolist()
RRR = after_feature_test.tolist()
LLL = after_target_test.tolist()


def msefunc(predictval, realval):
    # despite the name, the fitness used here is the R2 score
    print("R2 = ", metrics.r2_score(realval, predictval))
    return metrics.r2_score(realval, predictval)


def SVMResult(vardim, x, bound):
    # train an SVR with the chromosome's (C, epsilon, gamma) and score it
    X = XXX
    y = CCC
    c = x[0]
    e = x[1]
    g = x[2]
    clf = SVR(C=c, epsilon=e, gamma=g)
    clf.fit(X, y)
    predictval = clf.predict(RRR)
    return msefunc(predictval, LLL)


class GAIndividual:
    '''individual of genetic algorithm'''

    def __init__(self, vardim, bound):
        '''
        vardim: dimension of variables
        bound: boundaries of variables
        '''
        self.vardim = vardim
        self.bound = bound
        self.fitness = 0.

    def generate(self):
        '''generate a random chromosome'''
        n = self.vardim
        rnd = np.random.random(size=n)
        self.chrom = np.zeros(n)
        for i in range(0, n):
            self.chrom[i] = self.bound[0, i] + (self.bound[1, i] - self.bound[0, i]) * rnd[i]

    def calculateFitness(self):
        '''calculate the fitness of the chromosome'''
        self.fitness = SVMResult(self.vardim, self.chrom, self.bound)


class GeneticAlgorithm:
    '''The class for genetic algorithm'''

    def __init__(self, sizepop, vardim, bound, MAXGEN, params):
        '''
        sizepop: population size
        vardim: dimension of variables
        bound: boundaries of variables
        MAXGEN: termination condition (number of generations)
        params: algorithm parameters, [crossover rate, mutation rate, alpha]
        '''
        self.sizepop = sizepop
        self.MAXGEN = MAXGEN
        self.vardim = vardim
        self.bound = bound
        self.population = []
        self.fitness = np.zeros((self.sizepop, 1))
        self.trace = np.zeros((self.MAXGEN, 3))
        self.params = params

    def initialize(self):
        '''initialize the population'''
        for i in range(0, self.sizepop):
            ind = GAIndividual(self.vardim, self.bound)
            ind.generate()
            self.population.append(ind)

    def evaluate(self):
        '''evaluate the population fitnesses'''
        for i in range(0, self.sizepop):
            self.population[i].calculateFitness()
            self.fitness[i] = self.population[i].fitness

    def solve(self):
        '''evolution process of the genetic algorithm'''
        self.t = 0
        self.initialize()
        self.evaluate()
        best = np.max(self.fitness)
        bestIndex = np.argmax(self.fitness)
        self.best = copy.deepcopy(self.population[bestIndex])
        self.avefitness = np.mean(self.fitness)
        self.maxfitness = np.max(self.fitness)
        self.trace[self.t, 0] = self.best.fitness
        self.trace[self.t, 1] = self.avefitness
        self.trace[self.t, 2] = self.maxfitness
        print("Generation %d: optimal function value is: %f; average function value is %f; max function value is %f" % (
            self.t, self.trace[self.t, 0], self.trace[self.t, 1], self.trace[self.t, 2]))
        while (self.t < self.MAXGEN - 1):
            self.t += 1
            self.selectionOperation()
            self.crossoverOperation()
            self.mutationOperation()
            self.evaluate()
            best = np.max(self.fitness)
            bestIndex = np.argmax(self.fitness)
            if best > self.best.fitness:
                self.best = copy.deepcopy(self.population[bestIndex])
            self.avefitness = np.mean(self.fitness)
            self.maxfitness = np.max(self.fitness)
            self.trace[self.t, 0] = self.best.fitness
            self.trace[self.t, 1] = self.avefitness
            self.trace[self.t, 2] = self.maxfitness
            print("Generation %d: optimal function value is: %f; average function value is %f; max function value is %f" % (
                self.t, self.trace[self.t, 0], self.trace[self.t, 1], self.trace[self.t, 2]))
        print("Optimal function value is: %f; " % self.trace[self.t, 0])
        print("Optimal solution is:")
        print(self.best.chrom)
        self.printResult()

    def selectionOperation(self):
        '''roulette-wheel selection'''
        newpop = []
        totalFitness = np.sum(self.fitness)
        accuFitness = np.zeros((self.sizepop, 1))
        sum1 = 0.
        for i in range(0, self.sizepop):
            accuFitness[i] = sum1 + self.fitness[i] / totalFitness
            sum1 = accuFitness[i]
        for i in range(0, self.sizepop):
            r = random.random()
            idx = 0
            for j in range(0, self.sizepop - 1):
                if j == 0 and r < accuFitness[j]:
                    idx = 0
                    break
                elif r >= accuFitness[j] and r < accuFitness[j + 1]:
                    idx = j + 1
                    break
            newpop.append(self.population[idx])
        self.population = newpop

    def crossoverOperation(self):
        '''arithmetic crossover'''
        newpop = []
        for i in range(0, self.sizepop, 2):
            idx1 = random.randint(0, self.sizepop - 1)
            idx2 = random.randint(0, self.sizepop - 1)
            while idx2 == idx1:
                idx2 = random.randint(0, self.sizepop - 1)
            newpop.append(copy.deepcopy(self.population[idx1]))
            newpop.append(copy.deepcopy(self.population[idx2]))
            r = random.random()
            if r < self.params[0]:
                crossPos = random.randint(1, self.vardim - 1)
                for j in range(crossPos, self.vardim):
                    newpop[i].chrom[j] = newpop[i].chrom[j] * self.params[2] + \
                        (1 - self.params[2]) * newpop[i + 1].chrom[j]
                    newpop[i + 1].chrom[j] = newpop[i + 1].chrom[j] * self.params[2] + \
                        (1 - self.params[2]) * newpop[i].chrom[j]
        self.population = newpop

    def mutationOperation(self):
        '''non-uniform mutation'''
        newpop = []
        for i in range(0, self.sizepop):
            newpop.append(copy.deepcopy(self.population[i]))
            r = random.random()
            if r < self.params[1]:
                mutatePos = random.randint(0, self.vardim - 1)
                theta = random.random()
                if theta > 0.5:
                    newpop[i].chrom[mutatePos] = newpop[i].chrom[mutatePos] - \
                        (newpop[i].chrom[mutatePos] - self.bound[0, mutatePos]) * \
                        (1 - random.random() ** (1 - self.t / self.MAXGEN))
                else:
                    newpop[i].chrom[mutatePos] = newpop[i].chrom[mutatePos] + \
                        (self.bound[1, mutatePos] - newpop[i].chrom[mutatePos]) * \
                        (1 - random.random() ** (1 - self.t / self.MAXGEN))
        self.population = newpop

    def printResult(self):
        '''plot the GA fitness traces'''
        x = np.arange(0, self.MAXGEN)
        y2 = self.trace[:, 1]
        y3 = self.trace[:, 2]
        plt.plot(x, y2, 'g', label='average value')
        plt.plot(x, y3, 'b', label='max value')
        fig = plt.gcf()
        fig.set_size_inches(18.5, 10.5)
        plt.xlabel("GENS")
        plt.ylabel("R2")
        plt.title("GA")
        plt.legend()
        plt.show()
```
if __name__ == "__main__": bound = np.array([[0,0,0],[10,2,100]]) ga = GeneticAlgorithm(20, 3, bound, 20, [0.9, 0.1, 0.5]) ga.solve()11.将优化后的参数代入SVR
```python
after_feature_train, after_feature_test, after_target_train, after_target_test = train_test_split(
    XXX, CCC, test_size=0.05, random_state=17)
svr = SVR(C=3.9505118, epsilon=0.15014702, gamma=38.02146648)   # parameters found by the GA
start = time.time()
svr.fit(XXX, CCC)   # again fit on the full retained set
after_predict_result = svr.predict(after_feature_test)
end = time.time()
DBSCANSVRGARESULT = after_predict_result

plt.plot(after_target_test, marker='x')      # ground truth
plt.plot(after_predict_result, marker='o')   # predictions
plt.title("DBSCAN-SVR-GA")
plt.legend(['True', 'DBSCAN-SVR-GA'])
fig = plt.gcf()
fig.set_size_inches(18.5, 10.5)
plt.show()
print("EVS:", explained_variance_score(after_target_test, after_predict_result))
print("R2:", metrics.r2_score(after_target_test, after_predict_result))
print("Time:", end - start)
```
EVS: 0.9919232390408123
R2: 0.9919208479231462
Time: 0.32965993881225586
It can hardly get any better...
12. GA-SVR alone, for comparison (the tuning run is omitted here)...
```python
start1 = time.time()
model_svr = SVR(C=3.9505118, epsilon=0.15014702, gamma=38.02146648)
model_svr.fit(feature_train, target_train)
predict_results1 = model_svr.predict(feature_test)
SVRGARESULT = predict_results1
end1 = time.time()

plt.plot(target_test)        # ground truth
plt.plot(predict_results1)   # predictions
fig = plt.gcf()
fig.set_size_inches(18.5, 10.5)
plt.title("SVR-GA")
plt.legend(['True', 'SVR-GA'])
plt.show()
print("EVS:", explained_variance_score(target_test, predict_results1))
print("R2:", metrics.r2_score(target_test, predict_results1))
print("Time:", end1 - start1)
```
EVS: 0.6045876080406886
R2: 0.5994740318045189
Time: 0.8312788009643555
Far behind...
13. A BPNN for comparison

```python
YY = []
for i in CCC:
    YY.append([i])

import numpy as np
from sklearn.preprocessing import MinMaxScaler
import tensorflow.compat.v1 as tf
tf.disable_v2_behavior()

x = XXX
y = YY
x_t = np.array(x)
y_true = np.array(y)
mm = MinMaxScaler()           # instantiate the scaler
std = mm.fit(x_t)             # fit it
x_true = std.transform(x_t)   # transform the features
print(x_true)
print(y_true)

# a single-hidden-layer network: 3 inputs -> 3 ReLU units -> 1 linear output
X = tf.placeholder(tf.float32, [None, 3])
Y = tf.placeholder(tf.float32, [None, 1])
w1 = tf.Variable(tf.truncated_normal([3, 3], stddev=0.001))
b1 = tf.Variable(tf.zeros([3]))
w2 = tf.Variable(tf.zeros([3, 1]))
b2 = tf.Variable(tf.zeros([1]))
L1 = tf.nn.relu(tf.matmul(X, w1) + b1)
y_pre = tf.matmul(L1, w2) + b2
loss = tf.reduce_mean(tf.cast(tf.square(Y - y_pre), tf.float32))
train_op = tf.train.GradientDescentOptimizer(0.1).minimize(loss)
init_op = tf.global_variables_initializer()
saver = tf.train.Saver()

start1 = time.time()
with tf.Session() as sess:
    sess.run(init_op)
    R2 = []
    for i in range(1, 2):               # number of training epochs
        TRUE = []
        PRED = []
        for j in range(len(y_true)):    # one sample per update step
            # [[...]] matches the placeholders' 2-D shape
            sess.run(train_op, feed_dict={X: [x_true[j, :]], Y: [y_true[j, :]]})
            print('Epoch %s, sample %s: loss %s, true %s, predicted %s' % (
                i, j + 1,
                sess.run(loss, feed_dict={X: [x_true[j, :]], Y: [y_true[j, :]]}),
                y_true[j, :],
                sess.run(y_pre, feed_dict={X: [x_true[j, :]], Y: [y_true[j, :]]})))
            TRUE.append(y_true[j, :].tolist())
            PRED.append(sess.run(y_pre, feed_dict={X: [x_true[j, :]], Y: [y_true[j, :]]}).tolist()[0])
end1 = time.time()
```
One pass over the data:
Epoch 1, sample 1: loss 44.515533, true [8.34], predicted [[1.668004]]
Epoch 1, sample 2: loss 22.748747, true [7.63], predicted [[2.8604357]]
Epoch 1, sample 3: loss 19.92319, true [8.44], predicted [[3.9764595]]
Epoch 1, sample 4: loss 2.6626563e-05, true [3.97], predicted [[3.9751601]]
Epoch 1, sample 5: loss 38.861004, true [11.77], predicted [[5.536141]]
Epoch 1, sample 6: loss 24.939627, true [11.8], predicted [[6.8060412]]
Epoch 1, sample 7: loss 11.694864, true [11.2], predicted [[7.7802243]]
Epoch 1, sample 8: loss 5.1589394, true [11.07], predicted [[8.79867]]
Epoch 1, sample 9: loss 0.72628474, true [11.08], predicted [[10.227777]]
Epoch 1, sample 10: loss 0.058379333, true [11.11], predicted [[11.351618]]
Epoch 1, sample 11: loss 0.037325915, true [11.08], predicted [[10.886801]]
Epoch 1, sample 12: loss 0.008513206, true [11.06], predicted [[11.152267]]
Epoch 1, sample 13: loss 9.5740324e-05, true [11.01], predicted [[11.019785]]
Epoch 1, sample 14: loss 1.72812e-05, true [10.99], predicted [[10.994157]]
Epoch 1, sample 15: loss 0.033472456, true [10.88], predicted [[10.697045]]
Epoch 1, sample 16: loss 0.00715772, true [10.87], predicted [[10.954603]]
Epoch 1, sample 17: loss 0.00039784683, true [10.93], predicted [[10.910054]]
Epoch 1, sample 18: loss 0.00052338885, true [10.96], predicted [[10.982878]]
Epoch 1, sample 19: loss 0.0008995263, true [10.89], predicted [[10.860008]]
Epoch 1, sample 20: loss 0.00014833717, true [10.82], predicted [[10.80782]]
... (intermediate samples omitted) ...
Epoch 1, sample 2663: loss 0.18928096, true [9.33], predicted [[8.894936]]
Epoch 1, sample 2664: loss 0.09988854, true [9.29], predicted [[8.9739485]]
Epoch 1, sample 2665: loss 1.451641, true [10.48], predicted [[9.275159]]
Epoch 1, sample 2666: loss 1.4686363, true [10.79], predicted [[9.578127]]
Epoch 1, sample 2667: loss 0.98703796, true [10.82], predicted [[9.826502]]
Epoch 1, sample 2668: loss 1.0708082, true [11.12], predicted [[10.085201]]
Epoch 1, sample 2669: loss 1.6277648, true [11.68], predicted [[10.404161]]
Epoch 1, sample 2670: loss 1.009364, true [11.66], predicted [[10.655329]]

14. BPNN results
```python
TRUEs = []
PREDs = []
for i in TRUE:
    TRUEs.append(i[0])
for i in PRED:
    PREDs.append(i)

# reuse train_test_split with the same seed: the *min variables hold the 5%
# test slice, which lines up with the SVR experiments' test set
TRUEmax, TRUEmin, PREDsmax, PREDsmin = train_test_split(
    TRUEs, PREDs, test_size=0.05, random_state=17)

plt.plot(TRUEmin)   # ground truth
plt.plot(PREDsmin)  # predictions
fig = plt.gcf()
fig.set_size_inches(18.5, 10.5)
plt.title("DBSCAN-BPNN")
plt.legend(['True', 'BPNN'])
plt.show()

print(metrics.mean_squared_error(TRUEmin, PREDsmin))            # MSE
print(np.sqrt(metrics.mean_squared_error(TRUEmin, PREDsmin)))   # RMSE
print(metrics.mean_absolute_error(TRUEmin, PREDsmin))           # MAE
print(metrics.r2_score(TRUEmin, PREDsmin))                      # R2
print(explained_variance_score(TRUEmin, PREDsmin))              # EVS
print(((end1 - start1) / 3600) * 366)                           # elapsed time (original script's scaling by 366)
```
MSE: 0.11390565147753237
RMSE: 0.33749911329888316
MAE: 0.2059761757637139
R2: 0.9660229469003566
EVS: 0.9660237974708329
Time: 1.6406082673867544
Very good, but still not quite enough.
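As an aside, the same one-hidden-layer network is much shorter in modern Keras; a sketch assuming the same scaled 3-feature input (an alternative formulation, not the code that produced the numbers above):

```python
from tensorflow import keras

model = keras.Sequential([
    keras.Input(shape=(3,)),
    keras.layers.Dense(3, activation='relu'),   # hidden layer, 3 ReLU units
    keras.layers.Dense(1),                      # linear output
])
model.compile(optimizer=keras.optimizers.SGD(learning_rate=0.1), loss='mse')
model.fit(np.array(x_true), np.array(y_true), batch_size=1, epochs=1, verbose=0)
```

The Keras version also makes it easy to train for more than one epoch and to batch the updates.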
15. Two comparison plots

```python
plt.plot(TRUEmin, marker='o')             # ground truth
plt.plot(PREDsmin, marker='>')            # BPNN predictions
plt.plot(DBSCANSVRGARESULT, marker='D')   # GA-tuned SVR predictions
plt.plot(DBSCANSVRRESULT, marker='s')     # untuned SVR predictions
fig = plt.gcf()
fig.set_size_inches(18.5, 10.5)
plt.title("COMPARE (After noise reduction)")
plt.legend(['True', 'DBSCAN-BPNN', 'DBSCAN-SVR-GA', 'DBSCAN-SVR'])
plt.show()
```
```python
plt.plot(target_test, marker='o')   # ground truth
plt.plot(SVRGARESULT, marker='P')   # GA-tuned SVR
plt.plot(SVRRESULT, marker='D')     # untuned SVR
fig = plt.gcf()
fig.set_size_inches(18.5, 10.5)
plt.title("COMPARE (Before noise reduction)")
plt.legend(['True', 'SVR-GA', 'SVR'])
plt.show()
```
Setting aside the changes made to the original dataset, purely in terms of prediction accuracy the t-SNE-DBSCAN-GA-SVR pipeline performs very well. I hope it gives you some inspiration.