报错:AttributeError: 'DataFrame' object has no attribute 'dtype'
错误原因:scikit-learn 从 0.20.1 起不再自动把传入的 pandas DataFrame 转换为 numpy 数组,因此调用时需要显式传入 numpy 数组(例如 df.values),养成更加规范的书写习惯即可解决。
将 `pairwise_distances(df, metric="jaccard")` 修改为 `pairwise_distances(df.values, metric="jaccard")`
`from sklearn.metrics import jaccard_similarity_score`(该函数在 scikit-learn 0.21 中被弃用,并在 0.23 中移除)
修改为:`from sklearn.metrics import jaccard_score`
`.ix` 索引器已在 pandas 1.0 中移除,运行会报错,需要修改为 `.loc`(按标签索引)或 `.iloc`(按位置索引);上面的链接介绍了这几种索引方式的区别
# 案例1:基于用户的协同过滤 (Case 1: user-based collaborative filtering)
#
# Demo script: compute the pairwise Jaccard similarity between users from a
# 0/1 purchase matrix, pick each user's two most similar users, and recommend
# the items those neighbours bought that the user has not bought yet.
import pandas as pd
import numpy as np
from pprint import pprint

from sklearn.metrics.pairwise import pairwise_distances

users = ["User1", "User2", "User3", "User4", "User5"]
items = ["Item A", "Item B", "Item C", "Item D", "Item E"]

# Purchase records: one row per user, one column per item (1 = bought).
datasets = [
    [1, 0, 1, 1, 0],
    [1, 0, 0, 1, 1],
    [1, 0, 1, 0, 0],
    [0, 1, 0, 1, 1],
    [1, 1, 1, 0, 1],
]
# The class is spelled `DataFrame`; `pd.Dataframe` raises AttributeError.
df = pd.DataFrame(datasets, columns=items, index=users)
#        Item A  Item B  Item C  Item D  Item E
# User1       1       0       1       1       0
# User2       1       0       0       1       1
# User3       1       0       1       0       0
# User4       0       1       0       1       1
# User5       1       1       1       0       1

# Jaccard similarity is 1 minus the Jaccard distance. A plain numpy array is
# passed (not the DataFrame), cast to bool because "jaccard" is a boolean
# metric and scikit-learn otherwise warns about converting the data itself.
user_similar = 1 - pairwise_distances(df.values.astype(bool), metric="jaccard")
user_similar = pd.DataFrame(user_similar, columns=users, index=users)
print("用户之间的两两相似度:")
print(user_similar)

# For every user keep the two most similar *other* users.
topN_users = {}
for current_user in user_similar.index:
    # Drop the user's own entry: self-similarity is always 1.
    neighbours = user_similar.loc[current_user].drop([current_user])
    # nlargest keeps the first occurrence on ties, so the result is
    # deterministic even when several neighbours share the same score.
    topN_users[current_user] = list(neighbours.nlargest(2).index)

print("Top2相似用户:")
pprint(topN_users)


def _purchased_items(user):
    """Return the set of item names whose entry in ``df`` is non-zero for *user*."""
    return set(df.loc[user].replace(0, np.nan).dropna().index)


# Build the recommendations: union of the neighbours' purchases, minus what
# the user already bought. Sets are used so duplicates collapse automatically.
rs_results = {}
for user, sim_users in topN_users.items():
    candidates = set()
    for sim_user in sim_users:
        candidates |= _purchased_items(sim_user)
    rs_results[user] = candidates - _purchased_items(user)

print("最终推荐结果:")
pprint(rs_results)

# 案例2:基于协同过滤的电影推荐 (Case 2: movie recommendation based on collaborative filtering)
import os

import pandas as pd
import numpy as np

# NOTE(review): the snippet as published uses DATA_PATH, CACHE_DIR and
# cache_path without defining them (and never imports os), so it fails with
# NameError. The values below are assumed defaults for the MovieLens
# "ml-latest-small" layout -- adjust them to where your data actually lives.
DATA_PATH = "./datasets/ml-latest-small/ratings.csv"
CACHE_DIR = "./datasets/cache/"

# Per-mode settings: cache file name, message when loading, message when computing.
_SIMILARITY_SETTINGS = {
    "user": ("user_similarity.cache", "正从缓存加载用户相似度矩阵", "开始计算用户相似度矩阵"),
    "item": ("item_similarity.cache", "正从缓存加载物品相似度矩阵", "开始计算物品相似度矩阵"),
}


def load_data(data_path):
    """Load the user-by-movie rating matrix, using a pickle cache when present.

    Args:
        data_path: Path to a CSV file whose first three columns are
            ``userId``, ``movieId`` and ``rating``.

    Returns:
        A DataFrame indexed by ``userId`` with one column per ``movieId``;
        cells hold the rating, or NaN where the pair does not occur in the CSV.
    """
    cache_path = os.path.join(CACHE_DIR, "ratings_matrix.cache")

    print("开始加载数据集...")
    if os.path.exists(cache_path):
        print("加载缓存中...")
        # The cache is a pickle written by this function; only load pickle
        # files you created yourself, unpickling untrusted data is unsafe.
        ratings_matrix = pd.read_pickle(cache_path)
        print("从缓存加载数据集完毕")
    else:
        print("加载新数据中...")
        # Explicit column dtypes for the three columns that are read.
        dtype = {"userId": np.int32, "movieId": np.int32, "rating": np.float32}
        # Only the first three columns are read: user id, movie id, rating.
        ratings = pd.read_csv(data_path, dtype=dtype, usecols=range(3))
        # Pivot so that movie ids become the columns: a user x movie matrix.
        ratings_matrix = ratings.pivot_table(
            index=["userId"], columns=["movieId"], values="rating"
        )
        # to_pickle does not create missing directories.
        os.makedirs(CACHE_DIR, exist_ok=True)
        ratings_matrix.to_pickle(cache_path)
        print("数据集加载完毕")
    return ratings_matrix


def compute_pearson_similarity(ratings_matrix, based="user"):
    """Compute (or load from cache) a Pearson-correlation similarity matrix.

    Args:
        ratings_matrix: Rating DataFrame with users as rows and items as
            columns, as returned by ``load_data``.
        based: ``"user"`` for user-user similarity, ``"item"`` for item-item
            similarity.

    Returns:
        A square DataFrame of pairwise correlations between users or items.

    Raises:
        ValueError: If *based* is neither ``"user"`` nor ``"item"``.
    """
    if based not in _SIMILARITY_SETTINGS:
        raise ValueError("Unhandled 'based' Value: %s" % based)

    cache_name, loading_msg, computing_msg = _SIMILARITY_SETTINGS[based]
    cache_path = os.path.join(CACHE_DIR, cache_name)

    if os.path.exists(cache_path):
        print(loading_msg)
        similarity = pd.read_pickle(cache_path)
    else:
        print(computing_msg)
        # DataFrame.corr() correlates columns, so users must be moved onto
        # the columns (transpose) to obtain user-user similarity.
        source = ratings_matrix.T if based == "user" else ratings_matrix
        similarity = source.corr()
        os.makedirs(CACHE_DIR, exist_ok=True)
        similarity.to_pickle(cache_path)

    print("相似度矩阵计算/加载完毕")
    return similarity


if __name__ == '__main__':
    ratings_matrix = load_data(DATA_PATH)

    user_similar = compute_pearson_similarity(ratings_matrix, based="user")
    print(user_similar)

    item_similar = compute_pearson_similarity(ratings_matrix, based="item")
    print(item_similar)
欢迎分享,转载请注明来源:内存溢出
评论列表(0条)