Original code:
df = pd.read_csv('UserBehavior.csv', header=None)
After the fix:
df = pd.concat(chunk for chunk in pd.read_csv('UserBehavior.csv.zip', header=None, chunksize=10000))
The code above loads the CSV in chunks, so it never runs out of memory, but it is extremely slow.
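If the chunked load itself is the bottleneck, one common speed-up is to read only the columns that are actually used and pin them to compact dtypes up front; a minimal sketch, assuming the five-column layout and the column names this post assigns later (the int32 dtypes are an assumption about the data, not a property of the dataset):

import pandas as pd

# Read only the first three columns (columns D and E are dropped later anyway)
# and pin their dtypes; with header=None the column labels are 0, 1, 2, ...
df = pd.read_csv('UserBehavior.csv.zip', header=None, usecols=[0, 1, 2],
                 dtype={0: 'int32', 1: 'int32', 2: 'int32'})
df.columns = ['userid', 'num', 'proid']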
pivot throwing an error
Original code:
traindf = traindata.pivot(index='userid', columns='proid', values='num')
After the fix:
data = data1.drop(df[df['userid'] > 10000].index)
print(data.shape)
print(data.head(5))
traindata, testdata = train_test_split(data, test_size=0.2, random_state=1)
traindata1 = traindata.groupby(['userid', 'proid'], as_index=False).count()
testdata1 = testdata.groupby(['userid', 'proid'], as_index=False).count()
print(traindata1.head(5))
print(testdata1.head(5))
traindf = traindata1.pivot(index='userid', columns='proid', values='num')
The fix here does nothing more than shrink the dataset until pivot can run: drop removes every row whose userid is greater than 10000, which keeps the resulting user × product matrix small enough to fit in memory.
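The same filter can also be written as a boolean mask, which reads a little more directly (a sketch equivalent to the drop() call above):

# Keep only the rows whose userid is at most 10000.
data = data1[data1['userid'] <= 10000]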
A complete sample program:

import pandas as pd
import numpy as np
import math
from sklearn.model_selection import train_test_split

def prediction(df, userdf, Nn=15):  # Nn: number of neighbours
    corr = df.T.corr()  # user-user similarity (correlation) matrix
    rats = userdf.copy()
    for usrid in userdf.index:
        dfnull = df.loc[usrid][df.loc[usrid].isnull()]
        usrv = df.loc[usrid].mean()  # the user's mean rating
        for i in range(len(dfnull)):
            nft = (df[dfnull.index[i]]).notnull()
            # build the neighbour list
            if Nn <= len(nft):
                nlist = df[dfnull.index[i]][nft][:Nn]
            else:
                nlist = df[dfnull.index[i]][nft][:len(nft)]
            nlist = nlist[corr.loc[usrid, nlist.index].notnull()]
            nratsum = 0
            corsum = 0
            if 0 != nlist.size:
                nv = df.loc[nlist.index, :].T.mean()  # the neighbours' mean ratings
                for index in nlist.index:
                    ncor = corr.loc[usrid, index]
                    nratsum += ncor * (df[dfnull.index[i]][index] - nv[index])
                    corsum += abs(ncor)
                if corsum != 0:
                    rats.at[usrid, dfnull.index[i]] = usrv + nratsum / corsum
                else:
                    rats.at[usrid, dfnull.index[i]] = usrv
            else:
                rats.at[usrid, dfnull.index[i]] = 0
    return rats

# recommendation
def recomm(df, userdf, Nn=15, TopN=1):
    ratings = prediction(df, userdf, Nn)  # predicted ratings
    recomm = []  # holds the recommendation results
    for usrid in userdf.index:
        # the unrated items are the ones still NA
        ratft = userdf.loc[usrid].isnull()
        ratnull = ratings.loc[usrid][ratft]
        # sort the predicted ratings
        if len(ratnull) >= TopN:
            sortlist = (ratnull.sort_values(ascending=False)).index[:TopN]
        else:
            sortlist = ratnull.sort_values(ascending=False).index[:len(ratnull)]
        recomm.append(sortlist)
    return ratings, recomm

# read the data
# df = pd.read_csv('UserBehavior.csv.zip', header=None)
df = pd.concat(chunk for chunk in pd.read_csv('UserBehavior.csv.zip', header=None, chunksize=10000))
# name the columns
df.columns = ['userid', 'num', 'proid', 'D', 'E']
print(df.head(5))
data1 = df.drop(['D', 'E'], axis=1)
data = data1.drop(df[df['userid'] > 10000].index)
print(data.shape)
print(data.head(5))
traindata, testdata = train_test_split(data, test_size=0.2, random_state=1)
traindata1 = traindata.groupby(['userid', 'proid'], as_index=False).count()
testdata1 = testdata.groupby(['userid', 'proid'], as_index=False).count()
print(traindata1.head(5))
print(testdata1.head(5))
traindf = traindata1.pivot(index='userid', columns='proid', values='num')
print(traindf.head(5))
testdf = testdata1.pivot(index='userid', columns='proid', values='num')
print(testdf.head(5))
traindf.rename(index={i: 'usr%d' % i for i in traindf.index}, inplace=True)
traindf.rename(columns={i: 'pro%d' % i for i in traindf.columns}, inplace=True)
testdf.rename(index={i: 'usr%d' % i for i in testdf.index}, inplace=True)
testdf.rename(columns={i: 'pro%d' % i for i in testdf.columns}, inplace=True)
userdf = traindf.loc[testdf.index]
# get the predicted ratings and the recommendation lists
trainnums, trainrecomm = recomm(traindf, userdf)
print(trainnums)
print(trainrecomm)
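As a quick sanity check of prediction() and recomm(), they can be run on a tiny hand-made ratings table (the data below is invented purely for illustration):

toy = pd.DataFrame(
    {'pro1': [5, 4, None],
     'pro2': [3, 3, 4],
     'pro3': [4, None, 5],
     'pro4': [None, 3, 4]},
    index=['usr1', 'usr2', 'usr3'])

# Fill the missing cells with predictions and recommend one product per user.
ratings, recs = recomm(toy, toy.copy(), Nn=2, TopN=1)
print(ratings)  # the NaN cells are replaced by predicted scores
print(recs)     # e.g. usr1 gets pro4 recommended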
The code snippet is also available in the attached resource student.csv.
Odds and ends
Development platforms
Anaconda, Spyder, PyCharm: for number-crunching on large datasets, the first two run far faster than the third.
When the dataset is too large for memory to hold at once, read it piece by piece. With the get_chunk method you only have to handle part of the data at a time:
read_line = 10000  # number of rows to read per call
df.get_chunk(read_line)
(get_chunk is available on the reader object that pd.read_csv returns when iterator=True, as shown below.)
reader = pd.read_csv('some_data.csv', iterator=True)
reader.get_chunk(100)
reader = pd.read_csv('some_data.csv', chunksize=100)
With chunksize, the call returns a TextFileReader rather than a DataFrame, so treating it like one raises a "'TextFileReader' object has no attribute '…'" error.
In that case, use:
df = pd.concat(chunk for chunk in pd.read_csv(csvname, chunksize=5000))
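If even the final concat is too slow or too large, a further option is to do the work inside the loop and never build the full DataFrame; a sketch with a placeholder aggregation:

import pandas as pd

total_rows = 0
for chunk in pd.read_csv('some_data.csv', chunksize=5000):
    total_rows += len(chunk)  # replace with your real per-chunk processing
print(total_rows)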