图书推荐系统(四)之基于物品的协同过滤
利用select_data.txt(user-bookid-score),采用基于物品的协同过滤算法,计算物品相似度,为每个用户推荐20个物品。
1、完整代码如下
item_based_cf.py
# -*- coding: utf-8 -*-
"""
Author: JinHelen
Date: 2019-04-21
Desc: 基于物品的协同过滤算法,产出:
1: 每个用户的20个推荐item
2: 每个item最相似的20个item
思路:
1、加载用户id-图书id-评分数据到data字典,data={userid:{itemid:score}}
2、计算物品间相似度。
(1)item_user_count记录评价过物品i的人数
(2)count为共现矩阵,count={物品i:{物品j:同时为物品i和物品j评分的用户数}}
(3)相似度=同时为物品i和物品j评分的用户数/math.sqrt(item_user_count[i]*item_user_count[j])
3、根据共现矩阵构造物品i与物品j的相似度矩阵
4、排序,每个item只保留最相似的20个item,写入data/item_sim.json
5、为用户进行推荐。用户对物品j的兴趣度 = 用户对物品i的评分*相似度
6、构造循环,为每个用户推荐20本图书,写入user_rec_result.txt。
"""
import random
import math
import os
import json
class ItemBasedCF:
    """Item-based collaborative filtering recommender.

    Ratings are read from a ``userid,itemid,score`` text file. Two items are
    considered related when the same user gave both a positive score, and
    their similarity is::

        co_raters(i, j) / sqrt(raters(i) * raters(j))

    Only the ``top_n`` most similar neighbours of every item are kept. The
    resulting table is cached as JSON in ``sim_file``.

    NOTE: an existing ``sim_file`` is loaded as-is and is never checked
    against ``datafile``; delete it to force a recomputation after the
    ratings change.
    """

    def __init__(self, datafile, sim_file="data/item_sim.json", top_n=20):
        """Load the ratings and build (or load) the item similarity table.

        Args:
            datafile: path of the ``userid,itemid,score`` ratings file.
            sim_file: path of the JSON similarity cache.
            top_n: number of most similar neighbours kept per item.
        """
        self.datafile = datafile
        self.sim_file = sim_file
        self.top_n = top_n
        self.data = self.loadData()
        self.items_sim = self.ItemSimilarityBest()

    def loadData(self):
        """Read the ratings file into ``{userid: {itemid: score}}``.

        Blank lines are skipped. A later rating of the same (user, item)
        pair overwrites an earlier one.

        Raises:
            ValueError: if a non-blank line does not have exactly three
                comma-separated fields or the score is not an integer.
        """
        print("加载数据...")
        data = dict()
        with open(self.datafile) as fh:
            for line in fh:
                line = line.strip()
                if not line:
                    continue
                userid, itemid, score = line.split(",")
                data.setdefault(userid, {})[itemid] = int(score)
        return data

    def ItemSimilarityBest(self):
        """Return ``{item: {neighbour: similarity}}`` limited to ``top_n``.

        The table is loaded from ``self.sim_file`` when that file exists;
        otherwise it is computed from ``self.data`` and written there. Both
        paths return the same truncated table, so results do not depend on
        whether the cache was hit.
        """
        print("开始计算物品之间的相似度")
        if os.path.exists(self.sim_file):
            with open(self.sim_file, "r") as fh:
                return json.load(fh)

        # Number of users that gave each item a positive score.
        item_user_count = dict()
        # Co-occurrence matrix: {item i: {item j: users who rated both}}.
        count = dict()
        for user, ratings in self.data.items():
            print("user is {}".format(user))
            # Only positive scores count as an interaction.
            rated = [item for item, score in ratings.items() if score > 0]
            for i in rated:
                item_user_count[i] = item_user_count.get(i, 0) + 1
                row = count.setdefault(i, {})
                for j in rated:
                    if i != j:
                        # Accumulate one co-rating per user.
                        row[j] = row.get(j, 0) + 1

        # Co-occurrence matrix -> similarity matrix, keeping the best top_n.
        # Every item seen here has at least one positive rater, so the
        # denominator can never be zero.
        top_sim = dict()
        for i, related in count.items():
            scored = {
                j: cuv / math.sqrt(item_user_count[i] * item_user_count[j])
                for j, cuv in related.items()
            }
            ranked = sorted(scored.items(), key=lambda kv: kv[1], reverse=True)
            top_sim[i] = dict(ranked[:self.top_n])

        out_dir = os.path.dirname(self.sim_file)
        if out_dir:
            os.makedirs(out_dir, exist_ok=True)
        with open(self.sim_file, "w") as fh:
            json.dump(top_sim, fh)
        return top_sim

    def recommend(self, user, k=8, nitem=20):
        """Recommend up to ``nitem`` unseen items for ``user``.

        For every item ``i`` the user rated, the ``k`` most similar items
        ``j`` are looked up, and the user's interest in ``j`` is accumulated
        as ``score(i) * similarity(i, j)``. Items the user already rated are
        never recommended.

        Args:
            user: user id as it appears in the ratings file.
            k: number of nearest neighbour items considered per rated item.
            nitem: maximum number of recommendations returned.

        Returns:
            ``{itemid: interest}`` ordered by descending interest; empty for
            an unknown user.
        """
        result = dict()
        u_items = self.data.get(user, {})
        for i, pi in u_items.items():
            # An item may be absent from the table (e.g. a stale cache).
            neighbours = sorted(
                self.items_sim.get(i, {}).items(),
                key=lambda x: x[1],
                reverse=True,
            )[:k]
            for j, wj in neighbours:
                if j in u_items:
                    continue
                result[j] = result.get(j, 0) + pi * wj
        ranked = sorted(result.items(), key=lambda x: x[1], reverse=True)
        return dict(ranked[:nitem])
def main():
    """Recommend books for every user and append them to the result file.

    Each output line is ``userid,bookid,interest``. The file is opened in
    append mode, so running the script again adds to the existing results.
    """
    ib = ItemBasedCF("data/select_data.dat")
    # Open the output once instead of re-opening it for every user; the
    # context manager closes it even if a recommendation fails.
    with open("data/user_rec_result.txt", "a", encoding="utf-8") as fw:
        for user in ib.data:
            result = ib.recommend(user)
            print("用户{}进行推荐的结果如下:{}".format(user, result))
            if not result:
                # Nothing to recommend: do not emit a stray blank line.
                continue
            fw.writelines(
                "{},{},{}\n".format(user, item, interest)
                for item, interest in result.items()
            )


if __name__ == "__main__":
    main()
2、输出结果
(1)item_sim.json(计算图书相似度)
```
{"2354909": {"1006644": 0.7071067811865475, "3228662": 0.7071067811865475, "4010186": 0.7071067811865475, "1392712": 0.7071067811865475, "2063029": 0.7071067811865475, "3836277": 0.7071067811865475, "1460381": 0.7071067811865475, "3891900": 0.7071067811865475, "1957426": 0.7071067811865475, "1000121": 0.7071067811865475, "2349859": 0.7071067811865475, "1001388": 0.7071067811865475, "3063891": 0.7071067811865475, "2044798": 0.7071067811865475, "3842957": 0.7071067811865475, "3063893": 0.7071067811865475, "3234345": 0.7071067811865475, "3181840": 0.7071067811865475, "1443008": 0.7071067811865475, "1193912": 0.7071067811865475}, "1904516": {"2071128": 0.7071067811865475, "4745595": 0.7071067811865475, "1015498": 0
.........
"3003684": 1.0, "1344028": 1.0, "3876270": 1.0, "1163378": 1.0, "3128797": 1.0, "1101648": 1.0, "2315091": 1.0, "4082434": 1.0, "4736101": 1.0}, "4187225": {"1322457": 1.0, "2782115": 1.0, "2250081": 1.0, "3632643": 1.0, "4817597": 1.0, "4245043": 1.0, "2222835": 1.0, "2208526": 1.0, "3738701": 1.0, "2364723": 1.0, "1189975": 1.0, "3003684": 1.0, "1344028": 1.0, "3876270": 1.0, "1163378": 1.0, "3128797": 1.0, "1101648": 1.0, "2315091": 1.0, "4082434": 1.0, "4736101": 1.0}}
```
(2)user_rec_result.txt(为用户推荐图书)
2668761,1006644,2.82842712474619
2668761,3228662,2.82842712474619
2668761,4010186,2.82842712474619
2668761,1392712,2.82842712474619
2668761,2063029,2.82842712474619
2668761,3836277,2.82842712474619
2668761,1460381,2.82842712474619
2668761,3891900,2.82842712474619
4191271,2071128,3.5355339059327373
4191271,4745595,3.5355339059327373
4191271,1015498,3.5355339059327373
......
maohuhu,1276420,48.587025387966754
maohuhu,4062752,48.587025387966754
maohuhu,2268621,48.587025387966754
maohuhu,3928883,48.587025387966754