图书推荐系统(四)之基于物品的协同过滤

利用select_data.txt(user-bookid-score),采用基于物品的协同过滤算法,计算物品相似度,为每个用户推荐20个物品。

1、完整代码如下

item_based_cf.py

# -*- coding: utf-8 -*-
"""
   Author: JinHelen
   Date: 2019-04-21
   Desc: 基于物品的协同过滤算法,产出:
       1: 每个用户的20个推荐item
       2: 每个item最相似的20个item
   思路:
       1、加载用户id-图书id-评分数据到内存字典data,data={userid:{itemid:score}}
       2、计算物品间相似度。
       (1)item_user_count记录评价过物品i的人数
       (2)count为共现矩阵,count={物品i:{物品j:同时为物品i和物品j评分的用户数}}
       (3)相似度=同时为物品i和物品j评分的用户数/math.sqrt(item_user_count[i]*item_user_count[j])
       3、由共现矩阵构造物品i与物品j的相似度矩阵
       4、排序,每个item只保留最相似的20个item,写入data/item_sim.json
       5、为用户进行推荐。用户对物品j的兴趣度 = 用户对物品i的评分*相似度
       6、构造循环,为每个用户推荐20本图书,写入user_rec_result.txt。
"""
import random
import math
import os
import json

class ItemBasedCF:
    """Item-based collaborative filtering over (user, item, score) ratings.

    On construction the ratings file is loaded and an item-item similarity
    table is either read from a JSON cache or computed and written to it.
    ``recommend`` then scores unseen items for a single user.
    """

    # Default location of the similarity cache (used when no path is given).
    sim_file = "data/item_sim.json"
    # Number of most-similar neighbours kept per item in the table/cache.
    SIM_TOP_N = 20

    def __init__(self, datafile, sim_file="data/item_sim.json"):
        """Load ratings from *datafile* and build or load the similarity table.

        Args:
            datafile: path of a text file with one ``userid,itemid,score``
                record per line.
            sim_file: path of the JSON similarity cache. Defaults to the
                location the original script used.
        """
        self.datafile = datafile
        self.sim_file = sim_file
        self.data = self.loadData()
        self.items_sim = self.ItemSimilarityBest()

    def loadData(self):
        """Read the ratings file into ``{userid: {itemid: score}}``.

        Scores are parsed with ``int``. Blank lines are skipped so that a
        trailing newline at the end of the file does not abort the load.

        Raises:
            ValueError: if a non-blank line does not have exactly three
                comma-separated fields or the score is not an integer.
        """
        print("加载数据...")
        data = dict()
        # NOTE(review): the file is opened with the platform default encoding,
        # exactly as before; confirm the real encoding of the data file.
        with open(self.datafile) as f:
            for line in f:
                line = line.strip()
                if not line:
                    continue
                userid, itemid, score = line.split(",")
                data.setdefault(userid, {})[itemid] = int(score)
        return data

    def ItemSimilarityBest(self):
        """Return ``{item_i: {item_j: similarity}}``, top neighbours only.

        If the cache file exists it is loaded and returned as is. Otherwise
        the table is computed from ``self.data``:

            sim(i, j) = C[i][j] / sqrt(N[i] * N[j])

        where ``C[i][j]`` is the number of users that gave both items a
        positive score and ``N[i]`` is the number of users that gave item i a
        positive score. Each item keeps its ``SIM_TOP_N`` most similar
        entries; that truncated table is written to the cache and returned,
        so a fresh run and a cached run hand back the same structure.
        """
        print("开始计算物品之间的相似度")
        if os.path.exists(self.sim_file):
            with open(self.sim_file, "r") as f:
                return json.load(f)

        # N[i]: number of users with a positive score for item i.
        item_user_count = dict()
        # C[i][j]: number of users with a positive score for both i and j.
        count = dict()
        for user, ratings in self.data.items():
            print("user is {}".format(user))
            for i, score_i in ratings.items():
                item_user_count.setdefault(i, 0)
                if score_i > 0.0:
                    item_user_count[i] += 1
                row = count.setdefault(i, {})
                for j, score_j in ratings.items():
                    row.setdefault(j, 0)
                    if score_i > 0.0 and score_j > 0.0 and i != j:
                        # Accumulate across users (was overwritten with 1).
                        row[j] += 1

        # Co-occurrence -> similarity, done once after all users are counted.
        itemSim = dict()
        for i, related_items in count.items():
            sims = itemSim.setdefault(i, dict())
            for j, cuv in related_items.items():
                denom = math.sqrt(item_user_count[i] * item_user_count[j])
                # An item with no positive scores has N == 0; treat the
                # similarity as 0 instead of dividing by zero.
                sims[j] = cuv / denom if denom else 0.0

        # Keep only the most similar neighbours of every item.
        new_itemSim = {
            item: dict(
                sorted(sims.items(), key=lambda kv: kv[1], reverse=True)[: self.SIM_TOP_N]
            )
            for item, sims in itemSim.items()
        }

        sim_dir = os.path.dirname(self.sim_file)
        if sim_dir:
            os.makedirs(sim_dir, exist_ok=True)
        with open(self.sim_file, "w") as f:
            json.dump(new_itemSim, f)
        return new_itemSim

    def recommend(self, user, k=8, nitem=20):
        """Recommend up to *nitem* items the user has not rated yet.

        For every item i the user rated, the *k* most similar items j are
        taken from the similarity table, and the interest in j accumulates
        ``score(user, i) * sim(i, j)``.

        Args:
            user: user id as stored in ``self.data``.
            k: number of nearest neighbour items considered per rated item.
            nitem: maximum number of recommendations returned.

        Returns:
            ``{itemid: interest}`` ordered by descending interest; empty for
            an unknown user.
        """
        result = dict()
        u_items = self.data.get(user, {})
        for i, pi in u_items.items():
            # A cache built from other data may lack this item; skip it
            # rather than fail with KeyError.
            neighbours = self.items_sim.get(i, {})
            ranked = sorted(neighbours.items(), key=lambda x: x[1], reverse=True)
            for j, wj in ranked[:k]:
                if j in u_items:
                    continue
                result[j] = result.get(j, 0) + pi * wj
        return dict(sorted(result.items(), key=lambda x: x[1], reverse=True)[:nitem])

if __name__ == "__main__":
    # Script entry point: build the model, then append every user's
    # recommendations to data/user_rec_result.txt as "user,item,score" lines.
    ib = ItemBasedCF("data/select_data.dat")

    # The result file is opened once, in append mode as before, and the
    # context manager closes it even if a recommendation call fails.
    with open("data/user_rec_result.txt", "a", encoding="utf-8") as fw:
        for user in ib.data:
            result = ib.recommend(user)
            print("用户{}进行推荐的结果如下:{}".format(user, result))
            if not result:
                # Nothing to recommend: do not emit an empty line.
                continue
            # One line per recommendation: user id, book id, interest score.
            fw.write(
                "".join(
                    "{},{},{}\n".format(user, item, score)
                    for item, score in result.items()
                )
            )

2、输出结果

(1)item_sim.json(计算图书相似度)

```
{"2354909": {"1006644": 0.7071067811865475, "3228662": 0.7071067811865475, "4010186": 0.7071067811865475, "1392712": 0.7071067811865475, "2063029": 0.7071067811865475, "3836277": 0.7071067811865475, "1460381": 0.7071067811865475, "3891900": 0.7071067811865475, "1957426": 0.7071067811865475, "1000121": 0.7071067811865475, "2349859": 0.7071067811865475, "1001388": 0.7071067811865475, "3063891": 0.7071067811865475, "2044798": 0.7071067811865475, "3842957": 0.7071067811865475, "3063893": 0.7071067811865475, "3234345": 0.7071067811865475, "3181840": 0.7071067811865475, "1443008": 0.7071067811865475, "1193912": 0.7071067811865475}, "1904516": {"2071128": 0.7071067811865475, "4745595": 0.7071067811865475, "1015498": 0

.........

"3003684": 1.0, "1344028": 1.0, "3876270": 1.0, "1163378": 1.0, "3128797": 1.0, "1101648": 1.0, "2315091": 1.0, "4082434": 1.0, "4736101": 1.0}, "4187225": {"1322457": 1.0, "2782115": 1.0, "2250081": 1.0, "3632643": 1.0, "4817597": 1.0, "4245043": 1.0, "2222835": 1.0, "2208526": 1.0, "3738701": 1.0, "2364723": 1.0, "1189975": 1.0, "3003684": 1.0, "1344028": 1.0, "3876270": 1.0, "1163378": 1.0, "3128797": 1.0, "1101648": 1.0, "2315091": 1.0, "4082434": 1.0, "4736101": 1.0}}
```

(2)user_rec_result.txt(为用户推荐图书)

2668761,1006644,2.82842712474619
2668761,3228662,2.82842712474619
2668761,4010186,2.82842712474619
2668761,1392712,2.82842712474619
2668761,2063029,2.82842712474619
2668761,3836277,2.82842712474619
2668761,1460381,2.82842712474619
2668761,3891900,2.82842712474619
4191271,2071128,3.5355339059327373
4191271,4745595,3.5355339059327373
4191271,1015498,3.5355339059327373
......
maohuhu,1276420,48.587025387966754
maohuhu,4062752,48.587025387966754
maohuhu,2268621,48.587025387966754
maohuhu,3928883,48.587025387966754