图书推荐系统(三)之数据处理
从原始数据中筛选出需要的数据集
1 、整理用户评分表
前一部分筛选出了100名用户和2.4w+本图书
整理得到用户id-图书id-评分表
思路: 先用全量数据user-bookid-score,若user在select_users.txt里并且bookid在book_mess.txt里,则得到新的表select_data.dat(user-bookid-score) get_new_data.py (1)完整代码如下:
# -*- coding: utf-8 -*-
"""
Author:JinHelen
Date:2019-04-21
Desc:从原始数据中筛选出需要的数据集 & 结合爬取的数据进行筛选
"""
class SelectData:
    """Filter the raw rating dump down to the pre-selected users and books.

    Reads the full ``user::bookid::score`` dataset and keeps only rows whose
    user appears in select_users.txt and whose book id appears in
    book_mess.txt, writing the surviving rows as ``user,bookid,score`` lines
    to select_data.dat.
    """

    def __init__(self):
        # Paths of the pre-selected users and books from the previous step.
        self.users_path = "data/select_users.txt"
        self.books_path = "data/select_books.txt"
        # Full rating dataset, one "user::bookid::score" record per line.
        self.all_data_path = "data/douban.dat"
        # Destination file for the filtered ratings.
        self.select_data_path = "data/select_data.dat"
        # Metadata file for books that have already been crawled.
        self.book_mess_path = "./data/book_mess.txt"
        # Ids of books with crawled metadata (first tab-separated field).
        self.have_get_book_ids = self._load_mess_data(self.book_mess_path)
        self.users = self._load_data(self.users_path)
        self.books = self._load_data(self.books_path)

    def _load_mess_data(self, path):
        """Return the first tab-separated field of every line in *path*."""
        # `with` guarantees the handle is closed even on error.
        with open(path, "r", encoding="utf-8") as f:
            return [line.strip().split("\t")[0] for line in f]

    def _load_data(self, path):
        """Return every line of *path*, stripped of surrounding whitespace."""
        with open(path, "r", encoding="utf-8") as f:
            return [line.strip() for line in f]

    def _select_use_data(self):
        """Write the ratings restricted to selected users and crawled books."""
        # Hoist membership tests into sets: O(1) lookups instead of an O(n)
        # list scan for every line of the (large) full dataset.
        users = set(self.users)
        book_ids = set(self.have_get_book_ids)
        rows = []
        # Iterate the file lazily instead of materializing it with readlines().
        with open(self.all_data_path, "r", encoding="utf-8") as f:
            for line in f:
                user, bookid, score = line.strip().split("::")
                # Keep the row only if the user is one of the selected users
                # and the book's metadata has been crawled.
                if user in users and bookid in book_ids:
                    rows.append("{},{},{}".format(user, bookid, score))
        with open(self.select_data_path, "w", encoding="utf-8") as fw:
            fw.write("\n".join(rows))
        print("数据选择完毕,写入文件:{}".format(self.select_data_path))
if __name__ == "__main__":
    # Run the data-filtering step as a standalone script.
    selector = SelectData()
    selector._select_use_data()
(2)输出select_data.dat文件(共计3.9w+条)
2668761,2354909,4
4191271,1904516,5
rearee.r,3327495,5
28921295,1507657,5
......
maohuhu,2339036,5
maohuhu,2274660,5
maohuhu,2339039,5
maohuhu,3115492,5
maohuhu,4187225,4
2、统计出现标签及出现的次数
(1)完整代码如下:
#-*- coding:utf-8 -*-
"""
Author: JinHelen
Date: 2019-04-21
Desc: 统计出现标签及出现的次数
"""
import json
def _analysis_tag():
tag_dict = dict()
path = "./data/book_mess.txt"
for line in open(path, "r", encoding="utf-8").readlines():
_list = line.strip().split("\t")
# 每一行的长度为5
if _list.__len__()!=5:
continue
for tag in _list[-1].split(","):
if tag !="":
tag_dict.setdefault(tag,0)
tag_dict[tag] += 1
# 取前100个标签,排序
new_tag_dict = sorted(tag_dict.items(),key=lambda k:k[1],reverse=True)[:100]
json.dump(dict(new_tag_dict), open("data/tags.json", 'w', encoding="utf-8"))
if __name__ == "__main__":
    # Guard the call so importing this module does not trigger file I/O.
    _analysis_tag()
(2)得到tags.json
{"\u5c0f\u8bf4": 5849, "\u5386\u53f2": 3514, "\u65e5\u672c": 2959,
.......
"\u5b66\u672f": 256, "\u6d77\u5916\u4e2d\u56fd\u7814\u7a76": 256}