图书推荐系统(三)之数据处理

从原始数据中筛选出需要的数据集

1、整理用户评分表

前一部分筛选出了100名用户和2.4w+本图书

整理得到用户id-图书id-评分表

思路: 先读取全量数据(user-bookid-score),若user在select_users.txt里并且bookid在book_mess.txt里,则把该条记录写入新的表select_data.dat(user-bookid-score)。对应脚本: get_new_data.py

(1)完整代码如下:

# -*- coding: utf-8 -*-
"""
    Author:JinHelen
    Date:2019-04-21
    Desc:从原始数据中筛选出需要的数据集 & 结合爬取的数据进行筛选
"""

class SelectData:
    """Filter the full rating dump down to selected users and crawled books.

    On construction the instance loads three id lists from disk:

    * ``users`` -- one entry per line of ``users_path``;
    * ``books`` -- one entry per line of ``books_path`` (loaded, but not
      consulted by the filter implemented in this class);
    * ``have_get_book_ids`` -- the first tab-separated column of every line
      of ``book_mess_path``.
    """

    def __init__(self):
        # Files listing the users and books used by later stages.
        self.users_path = "data/select_users.txt"
        self.books_path = "data/select_books.txt"
        # Data set containing the full rating data.
        self.all_data_path = "data/douban.dat"
        # Destination of the filtered ratings.
        self.select_data_path = "data/select_data.dat"
        # File holding the books whose information was already fetched.
        self.book_mess_path = "./data/book_mess.txt"
        # Ids of the books whose information was already fetched.
        self.have_get_book_ids = self._load_mess_data(self.book_mess_path)
        self.users = self._load_data(self.users_path)
        self.books = self._load_data(self.books_path)

    def _load_mess_data(self, path):
        """Return the first tab-separated field of every line in *path*."""
        # The context manager closes the handle even if reading fails.
        with open(path, "r", encoding="utf-8") as fr:
            return [line.strip().split("\t")[0] for line in fr]

    def _load_data(self, path):
        """Return every line of *path* with surrounding whitespace stripped."""
        with open(path, "r", encoding="utf-8") as fr:
            return [line.strip() for line in fr]

    def _select_use_data(self):
        """Write the ratings of selected users on crawled books to disk.

        Each non-blank line of ``all_data_path`` is split on ``"::"`` into
        ``user``, ``bookid`` and ``score``. A record is kept when ``user`` is
        in ``self.users`` and ``bookid`` is in ``self.have_get_book_ids``.
        Kept records are written to ``select_data_path`` as
        ``user,bookid,score`` lines joined by ``"\\n"`` (no trailing newline).

        Raises:
            ValueError: if a non-blank line does not split into exactly
                three ``"::"``-separated fields.
        """
        # Build the lookup sets once: membership tests run for every rating
        # line, and scanning a list each time is needlessly slow.
        wanted_users = set(self.users)
        known_book_ids = set(self.have_get_book_ids)

        selected = []
        with open(self.all_data_path, "r", encoding="utf-8") as fr:
            for line in fr:
                record = line.strip()
                # A blank line (e.g. a trailing one) carries no rating.
                if not record:
                    continue
                user, bookid, score = record.split("::")
                if user in wanted_users and bookid in known_book_ids:
                    selected.append("{},{},{}".format(user, bookid, score))

        # Closing the handle explicitly guarantees the data is flushed.
        with open(self.select_data_path, "w", encoding="utf-8") as fw:
            fw.write("\n".join(selected))
        print("数据选择完毕,写入文件:{}".format(self.select_data_path))

if __name__ == "__main__":
    # Constructing the selector loads the id lists; the call below then
    # filters the full data set and writes the result file.
    selector = SelectData()
    selector._select_use_data()

(2)输出select_data.dat文件(共计3.9w+条)

2668761,2354909,4
4191271,1904516,5
rearee.r,3327495,5
28921295,1507657,5
......
maohuhu,2339036,5
maohuhu,2274660,5
maohuhu,2339039,5
maohuhu,3115492,5
maohuhu,4187225,4

2、统计出现标签及出现的次数

(1)完整代码如下:

#-*- coding:utf-8 -*-

"""
    Author: JinHelen
    Date: 2019-04-21
    Desc: 统计出现标签及出现的次数
"""

import json

def _analysis_tag():
    tag_dict = dict()
    path = "./data/book_mess.txt"
    for line in open(path, "r", encoding="utf-8").readlines():
        _list = line.strip().split("\t")
        # 每一行的长度为5
        if _list.__len__()!=5:
            continue
        for tag in _list[-1].split(","):
            if tag !="":
                tag_dict.setdefault(tag,0)
                tag_dict[tag] += 1
    # 取前100个标签,排序
    new_tag_dict = sorted(tag_dict.items(),key=lambda k:k[1],reverse=True)[:100]
    json.dump(dict(new_tag_dict), open("data/tags.json", 'w', encoding="utf-8"))

# Run the tag analysis: reads ./data/book_mess.txt and writes data/tags.json.
_analysis_tag()

(2)得到tags.json

{"\u5c0f\u8bf4": 5849, "\u5386\u53f2": 3514, "\u65e5\u672c": 2959,
.......
"\u5b66\u672f": 256, "\u6d77\u5916\u4e2d\u56fd\u7814\u7a76": 256}