返回顶部

收藏

基于用户的协同过滤算法

更多

本代码是在学习《推荐系统实践》一书时完成的，参考了原书作者项亮的算法，程序可以运行和测试。本部分是基于用户的协同过滤算法的实现，实验结果与书上的结论基本一致，希望对学习推荐系统的同学有所帮助。

import random
import math
class UserBasedCF:
    """User-based collaborative filtering recommender with offline metrics.

    Typical use: construct with a rating file, call ``userSimilarityBest()``
    and then ``recommend()`` or one of the evaluation methods
    (``recallAndPrecision``, ``coverage``, ``popularity``).

    Attributes set by the methods below:
        datafile    -- path of the rating file last requested
        data        -- list of ``(userid, itemid, rating)`` tuples
        traindata   -- ``{user: {item: rating}}`` training split
        testdata    -- ``{user: {item: rating}}`` held-out split
        userSim     -- similarities written by ``userSimilarity``
        userSimBest -- similarities written by ``userSimilarityBest``
    """

    def __init__(self, datafile=None):
        """Load ``datafile`` (when given) and make the default split.

        Without a data file the instance starts empty instead of failing on
        ``open(None)``; ``readData(path)`` and ``splitData`` can be called
        later.
        """
        self.datafile = datafile
        self.data = []
        if datafile is not None:
            self.readData()
        self.splitData(3, 47)

    def readData(self, datafile=None):
        """Read the rating file into ``self.data``.

        Each non-blank line must hold at least three whitespace-separated
        columns: user id, item id and an integer rating. Further columns
        are ignored.

        Raises:
            ValueError: if no data file has been specified, or a non-blank
                line has fewer than three columns or a non-integer rating.
        """
        self.datafile = datafile or self.datafile
        if not self.datafile:
            raise ValueError("no data file specified")
        records = []
        # The context manager closes the file even if a line fails to parse.
        with open(self.datafile) as handle:
            for line in handle:
                fields = line.split()
                if not fields:
                    # Tolerate blank lines (e.g. a trailing newline).
                    continue
                userid, itemid, record = fields[:3]
                records.append((userid, itemid, int(record)))
        self.data = records

    def splitData(self, k, seed, data=None, M=8):
        """Split the records into ``self.traindata`` and ``self.testdata``.

        A record goes to the test set when ``random.randint(0, M) == k``.
        ``randint`` includes both end points, so there are ``M + 1`` equally
        likely buckets and roughly one record in ``M + 1`` is held out.

        Args:
            k: bucket number that selects the test records.
            seed: value passed to ``random.seed`` (the module-level
                generator) so the split is repeatable.
            data: iterable of ``(user, item, rating)``; defaults to
                ``self.data``.
            M: upper bound (inclusive) of the bucket numbers.
        """
        if data is None:
            data = self.data
        self.testdata = {}
        self.traindata = {}
        random.seed(seed)
        for user, item, record in data:
            if random.randint(0, M) == k:
                target = self.testdata
            else:
                target = self.traindata
            target.setdefault(user, {})[item] = record

    def userSimilarity(self, train=None):
        """Compute cosine similarity for every ordered pair of users.

        The result, stored in ``self.userSim`` as ``{u: {v: similarity}}``,
        is ``|N(u) & N(v)| / sqrt(|N(u)| * |N(v)|)`` where ``N(x)`` is the
        set of items of user ``x``. All pairs are visited, including those
        with no common item. A pair involving a user without items gets
        similarity 0.0.
        """
        train = train or self.traindata
        self.userSim = dict()
        # Build each user's item set once rather than once per pair.
        item_sets = dict((u, set(items)) for u, items in train.items())
        for u, items_u in item_sets.items():
            for v, items_v in item_sets.items():
                if u == v:
                    continue
                common = len(items_u & items_v)
                denom = math.sqrt(len(items_u) * len(items_v) * 1.0)
                similarity = common / denom if denom else 0.0
                self.userSim.setdefault(u, {})[v] = similarity

    def userSimilarityBest(self, train=None):
        """Compute cosine similarity through an item -> users inverted index.

        Only pairs of users sharing at least one item are visited, so the
        result in ``self.userSimBest`` (``{u: {v: similarity}}``) has no
        entries for unrelated pairs. The original author notes that this
        method is described on page 46 of the reference book and is the one
        used in the experiment.
        """
        train = train or self.traindata
        self.userSimBest = dict()

        # Inverted index: item -> set of users who interacted with it.
        item_users = dict()
        for u, items in train.items():
            for i in items:
                item_users.setdefault(i, set()).add(u)

        user_item_count = dict()  # number of items per user
        count = dict()            # count[u][v]: number of items shared by u, v
        for users in item_users.values():
            for u in users:
                user_item_count[u] = user_item_count.get(u, 0) + 1
                for v in users:
                    if u == v:
                        continue
                    row = count.setdefault(u, {})
                    row[v] = row.get(v, 0) + 1

        for u, related_users in count.items():
            sims = self.userSimBest.setdefault(u, dict())
            for v, cuv in related_users.items():
                sims[v] = cuv / math.sqrt(
                    user_item_count[u] * user_item_count[v] * 1.0)

    def recommend(self, user, train=None, k=8, nitem=40):
        """Return up to ``nitem`` recommended items as ``{item: score}``.

        The ``k`` users most similar to ``user`` (by ``self.userSimBest``)
        vote for each of their items with their similarity; items ``user``
        already has in ``train`` are skipped. If ``self.userSimBest`` has
        not been computed yet it is computed from ``train`` first. A user
        without any similar user gets an empty result.
        """
        train = train or self.traindata
        if getattr(self, "userSimBest", None) is None:
            self.userSimilarityBest(train)
        interacted_items = train.get(user, {})
        neighbours = sorted(self.userSimBest.get(user, {}).items(),
                            key=lambda x: x[1], reverse=True)[0:k]
        rank = dict()
        for v, wuv in neighbours:
            # .get(): the similarity table may mention users absent from
            # the train set passed in here.
            for i in train.get(v, {}):
                if i in interacted_items:
                    continue
                rank[i] = rank.get(i, 0) + wuv
        top = sorted(rank.items(), key=lambda x: x[1], reverse=True)[0:nitem]
        return dict(top)

    def recallAndPrecision(self, train=None, test=None, k=8, nitem=10):
        """Return ``(recall, precision)`` over all users of ``train``.

        ``recall`` is hits divided by the total number of test items;
        ``precision`` is hits divided by ``nitem`` times the number of users
        (the full list length is charged even when fewer items were
        recommended). A ratio with a zero denominator is reported as 0.0.
        The original author refers to page 43 of the reference book.
        """
        train = train or self.traindata
        test = test or self.testdata
        hit = 0
        relevant = 0
        recommended = 0
        for user in train:
            tu = test.get(user, {})
            rank = self.recommend(user, train=train, k=k, nitem=nitem)
            hit += sum(1 for item in rank if item in tu)
            relevant += len(tu)
            recommended += nitem
        recall = hit / (relevant * 1.0) if relevant else 0.0
        precision = hit / (recommended * 1.0) if recommended else 0.0
        return (recall, precision)

    def coverage(self, train=None, test=None, k=8, nitem=10):
        """Return the fraction of train items that get recommended to anyone.

        ``test`` is accepted for signature symmetry with the other metrics
        but is not used. Returns 0.0 when ``train`` has no items.
        """
        train = train or self.traindata
        recommend_items = set()
        all_items = set()
        for user, items in train.items():
            all_items.update(items)
            recommend_items.update(
                self.recommend(user, train, k=k, nitem=nitem))
        if not all_items:
            return 0.0
        return len(recommend_items) / (len(all_items) * 1.0)

    def popularity(self, train=None, test=None, k=8, nitem=10):
        """Return the mean ``log(1 + popularity)`` of all recommended items.

        An item's popularity is the number of train users who have it.
        ``test`` is accepted for signature symmetry but is not used. Returns
        0.0 when nothing is recommended. The original author refers to
        page 44 of the reference book.
        """
        train = train or self.traindata
        item_popularity = dict()
        for items in train.values():
            for item in items:
                item_popularity[item] = item_popularity.get(item, 0) + 1
        ret = 0
        n = 0
        for user in train:
            rank = self.recommend(user, train, k=k, nitem=nitem)
            for item in rank:
                ret += math.log(1 + item_popularity[item])
                n += 1
        if not n:
            return 0.0
        return ret / (n * 1.0)

def testRecommend():
    """Print the recommendations for one user next to their held-out ratings.

    Loads ``u.data``, re-splits it with bucket 4 and seed 100, and recommends
    items for user "345" from the 3 most similar users. Each output line shows
    the item, its recommendation score and the user's rating for it in the
    test split (0 when the item is not in the test split).
    """
    ubcf = UserBasedCF('u.data')  # the constructor already reads the file
    ubcf.splitData(4,100)
    # recommend() reads self.userSimBest, which only userSimilarityBest()
    # fills in; userSimilarity() writes self.userSim instead.
    ubcf.userSimilarityBest()
    user = "345"
    rank = ubcf.recommend(user,k = 3)
    held_out = ubcf.testdata.get(user,{})
    for i,rvi in rank.items():
        record = held_out.get(i,0)
        # Call form with a single argument prints the same on Python 2 and 3.
        print("%5s: %.4f--%.4f" %(i,rvi,record))
def testUserBasedCF():
    """Print recall, precision, coverage and popularity for several K values.

    Loads ``u.data`` with the constructor's default split, computes the user
    similarities once, then evaluates the recommender with neighbourhood
    sizes 5, 10, 20, 40, 80 and 160. Recall, precision and coverage are
    printed as percentages.
    """
    cf  =  UserBasedCF('u.data')
    cf.userSimilarityBest()
    # Call form with a single argument prints the same on Python 2 and 3.
    print("%3s%20s%20s%20s%20s" % ('K',"recall",'precision','coverage','popularity'))
    for k in [5,10,20,40,80,160]:
        recall,precision = cf.recallAndPrecision( k = k)
        coverage = cf.coverage(k = k)
        popularity = cf.popularity(k = k)
        print("%3d%19.3f%%%19.3f%%%19.3f%%%20.3f" % (k,recall * 100,precision * 100,coverage * 100,popularity))

# Run the evaluation sweep only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    testUserBasedCF()
# This snippet comes from http://outofmemory.cn

标签:python,算法

收藏

0人收藏

支持

0

反对

0

»更多 您可能感兴趣的代码
  1. 2014-12-01 13:15:29Python冒泡排序 by Solon.Ring
  2. 2014-06-25 16:19:02带有重复的全排列 by xwz
  3. 2014-07-17 15:53:173行Python代码解简单的一元一次方程 by 大猪
  4. 2014-08-13 14:21:23求最大最小最大值因数 by wubai
  5. 2014-06-24 20:52:32寻找递增最长子序列 by 大猪
  6. 2014-06-25 20:05:45Python寻找第k小的元素 by 大猪
  7. 2014-06-26 18:34:05Python堆排序(最大堆) by xwz
  8. 2014-06-30 14:08:07一个简单的二叉树实现 by 张宋付
  9. 2014-06-30 18:17:56python果然适合演示算法 by 蟋蟀MM
  10. 2014-09-08 14:37:32Python 冒泡排序 by 阮小七
  11. 2018-08-21 11:54:27java代码判断数字是否是2的n次方 by 甄码农
相关聚客文章
  1. 0X55AA 发表 2015-01-31 13:54:10 python contextlib
  2. 博主 发表 2014-12-22 16:18:37 经典排序算法总结与实现
  3. 比特币 发表 2015-09-10 15:11:04 随机森林入门攻略(内含R、Python代码)
  4. master 发表 2015-11-20 05:43:14 一个从Linux镜像站点递归下载文件的脚本
  5. 0X55AA 发表 2014-03-19 08:43:30 sqlalchemy orm create_engine 设置数据库连接timeout
  6. admin 发表 2014-08-15 02:43:19 【算法&数据结构】再次学算法之栈
  7. 小数点 发表 2017-04-18 02:43:37 python学习之路——python切片模拟LRU算法
  8. 姚 广远 发表 2015-06-19 00:23:25 用Python实现各种排序算法
  9. rainy 发表 2015-11-25 03:05:26 图像主题色提取算法
  10. fox64194167 发表 2018-05-27 00:12:22 python 搜索插入位置
  11. TLHL28 发表 2011-05-23 03:20:37 triple_des(des3) 算法 - php,python 实现
  12. Yushneng 发表 2016-04-25 13:51:00 可视化图的基本算法

发表评论