Book Recommendation System (Part 2): Data Crawling

Scraping book information from the Douban website.

1 Recap

Following the previous part: filtering users by the books they have rated involves too many people. For example, the 100 most frequently rated books have been rated by about 43,000 users. Filtering books by user instead, the 50 most active users plus the least active ones together cover about 24,000 books.

2 Approach

The decision is to select books by user, i.e. to use the dataset of 20,000+ books read by 100 users.

3 Task

For each of these 20,000+ book ids, scrape the cover, title, tags, and other details from Douban.

4 Plan

(1) Tools: urllib.request + BeautifulSoup

(2) Files to set up: a directory for cover images, a file for the book information, and a file recording the ids of books that raised errors

(3) Workflow: build the URL, load the whole page, pick out the book information to scrape, write it to the file, and record any ids that raised errors; then download the book covers in a separate pass

(4) Pitfalls: 1. image downloads are policed much more strictly per IP, so download the book information first and the cover images afterwards; 2. the IP gets blocked easily, so switch between different Wi-Fi networks to change IP (a minimal sketch of this two-pass, throttled flow follows this list)
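To make the two-pass idea in item (4) concrete, here is a minimal control-flow sketch with the network and parsing details stubbed out. The stub names fetch_book_info and download_cover and the 1-6 second sleep range are illustrative only; the real implementations are _get_books_mess and _select_have_get_bookid in the complete script in section 5.

# -*- coding: utf-8 -*-
# Sketch of the two-pass, throttled crawl described in (4).
import random
import time


def fetch_book_info(book_id):
    """Pass 1 stub: fetch title / cover URL / score / tags for one book id
    (the real version is _get_books_mess in get_book_mess.py)."""
    print("fetching info for", book_id)


def download_cover(book_id):
    """Pass 2 stub: download the cover only for books whose info already exists
    (the real version is _select_have_get_bookid in get_book_mess.py)."""
    print("downloading cover for", book_id)


if __name__ == "__main__":
    book_ids = ["4913064"]                    # placeholder; the real ids come from Part 1

    # Pass 1: book information only -- image downloads are what trigger IP bans
    for book_id in book_ids:
        fetch_book_info(book_id)
        time.sleep(random.randint(1, 6))      # throttle between requests

    # Pass 2: cover images, ideally run later or from a different Wi-Fi / IP
    for book_id in book_ids:
        download_cover(book_id)
        time.sleep(random.randint(1, 6))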

5 Implementation

(1) Analyze the page of a single book on Douban. The number in the URL below is a book id obtained in the previous part:

https://book.douban.com/subject/4913064/
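A quick one-off check of this page (assuming bs4 and the lxml parser are installed, and reusing the browser User-Agent and the selectors of the complete script below) prints the four fields the scraper targets: title, cover image URL, average score, and tags.

# -*- coding: utf-8 -*-
# One-off check: where do title, cover, score, and tags live in the subject page?
from urllib import request

from bs4 import BeautifulSoup

headers = {"User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36"}
url = "https://book.douban.com/subject/4913064/"
req = request.Request(url, headers=headers)
body = BeautifulSoup(request.urlopen(req).read().decode("utf-8"), "lxml").body

print(body.find_all("span", property="v:itemreviewed")[0].text)                    # title
print(body.find_all("a", class_="nbg")[0]["href"])                                 # cover image URL
print(body.find_all("strong", class_="ll rating_num")[0].text.strip())             # average score
print([a.text for a in body.find_all("div", class_="blank20")[0].find_all("a")])   # tags

If any of these lines raises an IndexError, the page layout has changed and the corresponding selector in the script needs updating.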

(2) Complete code: get_book_mess.py

# -*- coding: utf-8 -*-
"""
    Author:JinHelen
    Date:2019-04-20
    Desc:获取图书相关信息
"""
from bs4 import BeautifulSoup
from urllib import request
import time
import random  # needed if the optional throttling sleep below is enabled
import os

class GetCateBooks:
    def __init__(self, select_book_path):
        self.headers = { 'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36'}
        # File containing the book ids to fetch
        self.select_books_path = select_book_path
        # Ids of books whose data could not be fetched properly, kept for backfilling
        self.books_mess_error = list()
        # Directory for the downloaded book covers
        self.book_img_path = "data/img"
        # File that the scraped book information is written to
        self.book_mess_path = "data/book_mess.txt"

        # Ids of books whose information has already been fetched
        self.have_ids = self._load_data()

    def _load_data(self):
        # Collect the ids already present in book_mess.txt so they are not fetched twice
        ids = list()
        if not os.path.exists(self.book_mess_path):
            return ids
        for one in open(self.book_mess_path, "r", encoding="utf-8").readlines():
            book_id = one.strip().split("\t")[0]
            ids.append(book_id)
        return ids

    # Append a line of data to the given file
    def _write_to_file(self, data, filename):
        fw = open(filename, "a", encoding="utf-8")
        fw.write(data)
        fw.close()

    # Fetch a page and return its parsed <body>, or None on failure
    def _get_page_content(self, url):
        req = request.Request(url, headers=self.headers )
        try:
            res = request.urlopen(req)
            code_status = res.getcode()
            if code_status == 200:
                page = res.read().decode("utf-8")
                body = BeautifulSoup(page, "lxml").body
            else:
                body = None
            return body
        except Exception as e:
            print("请求时发生错误, 继续执行, {}".format(e))
            return None

    # Download a book cover image to the given path
    def _download_book_img(self, url, path):
        request.urlretrieve(url, path)

    # Fetch the detail page of every book id and collect its information
    def _get_books_mess(self):
        print("Fetching detailed information for all books ...")
        books_list = [one.strip() for one in open(self.select_books_path, "r", encoding="utf-8").readlines()]
        print("{} books to fetch ...".format(len(books_list)))
        for i, book_id in enumerate(books_list):
            href = "https://book.douban.com/subject/" + book_id
            print("{} => fetching book info from {} ...".format(i + 1, href))

            # Optionally sleep for a random interval to reduce the risk of an IP ban
            # sec = random.randint(1, 6)
            # time.sleep(sec)

            book_mess = list()
            body = self._get_page_content(href)
            if body is not None:
                book_mess.append(book_id)
                try:
                    book_name = body.find_all("span",property="v:itemreviewed")[0].text
                    # print(book_name)
                    book_mess.append(book_name)
                except Exception as e:
                    print("ID为:{}的图书名字获取失败,异常为:{}".format(book_id,e))

                try:
                    book_img = body.find_all("a", class_="nbg")[0]["href"]
                    # print(book_img)
                    book_mess.append(book_img)
                except Exception as e:
                    print("ID为:{}的图书封面获取失败,异常为:{}".format(book_id, e))

                try:
                    book_avg_score = (body.find_all("strong",class_="ll rating_num")[0].text).replace(" ","")
                    # print(book_avg_score)
                    book_mess.append(book_avg_score)
                except Exception as e:
                    print("ID为:{}的图书评分获取失败,异常为:{}".format(book_id, e))

                book_tags = list()
                try:
                    div = body.find_all("div",class_="blank20")[0]
                    a_list = div.find_all("a")
                    for a in a_list:
                        book_tags.append(a.text)
                    # print(book_tags)
                    book_mess.append(",".join(book_tags))
                except Exception as e:
                    print("ID为:{}的图书标签获取失败,异常为:{}".format(book_id, e))

                # Write the book info to file; if any field is missing, record the id for backfilling
                if len(book_mess) == 5:
                    self._write_to_file("\t".join(book_mess) + "\n", self.book_mess_path)
                    # Cover download is deferred to a second pass to avoid IP bans
                    # if book_img:
                    #     try:
                    #         self._download_book_img(book_img, self.book_img_path + "/{}.jpg".format(book_id))
                    #     except Exception as e:
                    #         print(e)
                else:
                    self.books_mess_error.append(book_id)

    # For ids already scraped, download the cover if it is missing; write the ids not yet scraped to a file (downloading images is what tends to get the IP banned)
    def _select_have_get_bookid(self):
        not_get_list = set()
        books_list = [one.strip() for one in open(self.select_books_path, "r", encoding="utf-8").readlines()]
        i = 0
        for book_id in books_list:
            if book_id in self.have_ids:
                if os.path.exists(self.book_img_path + "/{}.jpg".format(book_id)):
                    continue
                # Book info exists but the cover image does not, so download it
                else:
                    href = "https://book.douban.com/subject/" + book_id
                    body = self._get_page_content(href)
                    if body is not None:
                        book_img = body.find_all("a", class_="nbg")
                        if book_img:
                            book_img = book_img[0]["href"]
                        else:
                            continue
                        try:
                            self._download_book_img(book_img, self.book_img_path + "/{}.jpg".format(book_id))
                        except Exception as e:
                            print(e)
                        i+=1
                        print("{} => 图书信息({})已经存在,下载图片完成!".format(i, href))
            else:
                not_get_list.add(book_id)
        fw=open("data/select_books_not_get.txt","a",encoding="utf-8")
        fw.write("\n".join( list(not_get_list) ))
        fw.close()

if __name__ == "__main__":

    # Execution order: run the block at the bottom first to write the ids that
    # still need data to select_books_not_get.txt (initially that is every id);
    # then run the commented block below to fetch book_mess (the book information);
    # finally run the bottom block again to download the cover images.

    # Step 2: fetch book information for the ids that are still missing
    # books = GetCateBooks("data/select_books_not_get.txt")
    # books._get_books_mess()
    # print(books.books_mess_error)
    # books._write_to_file("\n".join(books.books_mess_error), "data/error.txt")

    # Steps 1 and 3: record the ids not yet fetched and download missing covers
    books = GetCateBooks("data/select_books.txt")
    books._select_have_get_bookid()

(3) The resulting book_mess.txt file, with 23,000+ records in total:

4758387	并州迷雾	https://img1.doubanio.com/view/subject/l/public/s4377217.jpg	7.4	狄仁杰,推理,悬疑,小说,安娜芳芳,中国,同人作品,神探狄仁杰
1784978	男人的天方夜谭	https://img3.doubanio.com/view/subject/l/public/s1656293.jpg	6.4	李亚平,历史,杂文,读书笔记,小说,男人的天方夜谭,散文,随笔
1131678	招财狐狸	https://img3.doubanio.com/view/subject/l/public/s10431130.jpg	6.4	股票,花荣,投资,金融,操盘,投资理财,金融理财什么的,财经
1510382	济公全传(上下)	https://img3.doubanio.com/view/subject/l/public/s1508026.jpg	7.0	古典文学,济公全传,震后被盗,文学,小说,国人作品,四颗星,古典

..................

3241893	藏地密码4	https://img3.doubanio.com/view/subject/l/public/s3303626.jpg	7.8	小说,西藏,藏地密码,探险,宗教,何马,中国,奇幻
2030120	史学导论	https://img3.doubanio.com/view/subject/l/public/s2339042.jpg	8.9	历史,史学理论,历史研究入门读物,史学,史学导论,历史学,约翰·托什,思想
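For the later steps of the recommendation system this file has to be read back in. Below is a minimal loader sketch, assuming the five tab-separated columns shown above (book id, title, cover image URL, average score, comma-separated tags); load_book_mess is a hypothetical helper, not part of get_book_mess.py.

# -*- coding: utf-8 -*-
# Minimal sketch: read book_mess.txt back into dictionaries keyed by book id.
def load_book_mess(path="data/book_mess.txt"):
    books = {}
    fields = ["book_id", "name", "img_url", "avg_score", "tags"]
    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            parts = line.rstrip("\n").split("\t")
            # Pad rows that are missing trailing columns, then map to field names
            row = dict(zip(fields, parts + [""] * (len(fields) - len(parts))))
            row["tags"] = row["tags"].split(",") if row["tags"] else []
            books[row["book_id"]] = row
    return books


if __name__ == "__main__":
    books = load_book_mess()
    print(len(books), "books loaded")
    print(books.get("4758387"))   # first sample row above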