图书推荐系统(二)之数据爬取
在豆瓣网上爬取图书信息
1 前情提要
根据上一部分的内容,根据评价过的图书筛选用户,涉及人数太多了,比如出现最多的100本书有4.3w人评价过;按照用户来筛选图书的话,最活跃的50个用户和最不活跃的用户加起来有2.4w本图书。
2 方案
决定采用根据用户决定图书的方法,即采用100名用户读过2w+本图书的数据集。
3 任务
根据获取的这2w图书id到豆瓣爬图书封面、书名、标签等
4 思路
(1) 技术:request+BeautifulSoup
(2)建立文件:img的文件、图书信息的文件、用来记录出现error的图书id的文件
(3)流程:先找到url,加载整个网页,再选定要爬取的图书信息,存入文件中,记录出现error的图书id;再爬图书封面
(4) 遇到的坑:1、下载图片对IP管控比较严格。解决方案:先下载图书信息,再下载图书封面;2、IP容易被屏蔽。解决方案:连不同的wifi来换IP。
5 代码实现
(1)分析 豆瓣读书网某一本图书的网页 这里的数字即代表上一部分内容获取的图书id
https://book.douban.com/subject/4913064/
(2)完整代码 get_book_mess.py
# -*- coding: utf-8 -*-
"""
Author:JinHelen
Date:2019-04-20
Desc:获取图书相关信息
"""
from bs4 import BeautifulSoup
from urllib import request
import time
import os
class GetCateBooks:
    """Crawl Douban book pages for a list of book ids.

    For every id listed in ``select_book_path`` the crawler fetches the
    book's detail page and appends a tab-separated record
    (id, title, cover URL, average score, tags) to ``data/book_mess.txt``.
    Cover images are downloaded in a separate pass (``_select_have_get_bookid``)
    because image downloads get the IP banned much faster than page fetches.
    """

    def __init__(self, select_book_path):
        """
        Args:
            select_book_path: path of a text file with one book id per line.
        """
        # Pretend to be a desktop Chrome browser; Douban rejects the
        # default urllib User-Agent.
        self.headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36'}
        # File listing the book ids that should be fetched.
        self.select_books_path = select_book_path
        # Ids whose pages failed to load, kept so they can be retried later.
        self.books_mess_error = list()
        # Directory for downloaded cover images.
        self.book_img_path = "data/img"
        # Tab-separated output file, one record per book.
        self.book_mess_path = "data/book_mess.txt"
        # Ids already present in the output file (resume support).
        self.have_ids = self._load_data()

    def _load_data(self):
        """Return the book ids already recorded in ``self.book_mess_path``.

        Returns an empty list when the output file does not exist yet
        (first run) instead of crashing with FileNotFoundError.
        """
        ids = list()
        try:
            # Context manager closes the handle instead of leaking it.
            with open(self.book_mess_path, "r", encoding="utf-8") as fr:
                for line in fr:
                    # The id is the first tab-separated column.
                    book_id = line.strip().split("\t")[0]
                    if book_id:
                        ids.append(book_id)
        except FileNotFoundError:
            # Nothing has been fetched yet.
            pass
        return ids

    def _write_to_file(self, data, filename):
        """Append ``data`` to ``filename`` (UTF-8)."""
        with open(filename, "a", encoding="utf-8") as fw:
            fw.write(data)

    def _get_page_content(self, url):
        """Fetch ``url`` and return the parsed ``<body>`` tag.

        Returns:
            The BeautifulSoup body on HTTP 200, otherwise None.  Network
            errors are logged and swallowed so the crawl can continue.
        """
        req = request.Request(url, headers=self.headers)
        try:
            # The response object is a context manager; close it to avoid
            # leaking sockets over ~2w requests.
            with request.urlopen(req) as res:
                if res.getcode() == 200:
                    page = res.read().decode("utf-8")
                    return BeautifulSoup(page, "lxml").body
                return None
        except Exception as e:
            # Timeouts, 403 IP bans, decode failures: keep crawling.
            print("请求时发生错误, 继续执行, {}".format(e))
            return None

    def _download_book_img(self, url, path):
        """Download one cover image from ``url`` to local ``path``."""
        request.urlretrieve(url, path)

    def _get_books_mess(self):
        """Fetch and persist the details of every book in the id file.

        Each record field (title, cover URL, score, tags) is scraped in
        its own try/except so one missing element does not discard the
        whole record; ids whose page failed to load are collected in
        ``self.books_mess_error`` for a retry pass.
        """
        print("开始获取所有图书的详细信息 ...")
        with open(self.select_books_path, "r", encoding="utf-8") as fr:
            books_list = [one.strip() for one in fr]
        print("共有{}本图书信息需要获取 ...".format(len(books_list)))
        for i, book_id in enumerate(books_list, start=1):
            href = "https://book.douban.com/subject/" + book_id
            print("{} => 获取链接为:{}的图书信息 ...".format(i, href))
            body = self._get_page_content(href)
            if body is None:
                # Page failed to load; remember the id for a retry pass.
                self.books_mess_error.append(book_id)
                continue
            book_mess = [book_id]
            try:
                book_name = body.find_all("span", property="v:itemreviewed")[0].text
                book_mess.append(book_name)
            except Exception as e:
                print("ID为:{}的图书名字获取失败,异常为:{}".format(book_id, e))
            try:
                book_img = body.find_all("a", class_="nbg")[0]["href"]
                book_mess.append(book_img)
            except Exception as e:
                print("ID为:{}的图书封面获取失败,异常为:{}".format(book_id, e))
            try:
                score = body.find_all("strong", class_="ll rating_num")[0].text
                book_mess.append(score.replace(" ", ""))
            except Exception as e:
                print("ID为:{}的图书评分获取失败,异常为:{}".format(book_id, e))
            try:
                tag_div = body.find_all("div", class_="blank20")[0]
                book_tags = [a.text for a in tag_div.find_all("a")]
                book_mess.append(",".join(book_tags))
            except Exception as e:
                print("ID为:{}的图书标签获取失败,异常为:{}".format(book_id, e))
            # book_mess always holds at least the id here (the original
            # tested `is not None`, which a list can never be).
            self._write_to_file("\t".join(book_mess) + "\n", self.book_mess_path)

    def _select_have_get_bookid(self):
        """Download missing cover images for books whose info is on disk.

        Ids that have no info record yet are appended to
        ``data/select_books_not_get.txt`` so a later ``_get_books_mess``
        run can fetch them.  Images are downloaded here, separately from
        the info pass, because image traffic triggers IP bans quickly.
        """
        not_get_list = set()
        # Membership is tested once per book; a set makes each test O(1)
        # instead of scanning the ~2w-entry list every time.
        have_ids = set(self.have_ids)
        with open(self.select_books_path, "r", encoding="utf-8") as fr:
            books_list = [one.strip() for one in fr]
        downloaded = 0
        for book_id in books_list:
            if book_id not in have_ids:
                not_get_list.add(book_id)
                continue
            img_file = self.book_img_path + "/{}.jpg".format(book_id)
            if os.path.exists(img_file):
                # Info and image both present: nothing to do.
                continue
            # Info exists but the cover is missing: re-fetch the page to
            # get the cover URL, then download the image.
            href = "https://book.douban.com/subject/" + book_id
            body = self._get_page_content(href)
            if body is None:
                continue
            anchors = body.find_all("a", class_="nbg")
            if not anchors:
                continue
            try:
                self._download_book_img(anchors[0]["href"], img_file)
            except Exception as e:
                print(e)
            downloaded += 1
            print("{} => 图书信息({})已经存在,下载图片完成!".format(downloaded, href))
        if not_get_list:
            # Trailing newline so repeated append runs don't merge two ids
            # onto one line (the original omitted it); sorted for a
            # deterministic file.
            self._write_to_file("\n".join(sorted(not_get_list)) + "\n",
                                "data/select_books_not_get.txt")
if __name__ == "__main__":
    # Retry pass: fetch info for the ids that previously failed.
    # books = GetCateBooks("data/select_books_not_get.txt")
    # books._get_books_mess()
    # print(books.books_mess_error)
    # books._write_to_file("\n".join(books.books_mess_error),"data/error.txt")
    # Workflow: run the code below first to record which ids still need
    # their info fetched; then run the commented code above to fetch the
    # book info; finally run the code below again to download the covers.
    # Filter out the book ids whose data has not been fetched yet.
    books = GetCateBooks("data/select_books.txt")
    books._select_have_get_bookid()
(3)得到book_mess.txt文件:共计2.3w+条
4758387 并州迷雾 https://img1.doubanio.com/view/subject/l/public/s4377217.jpg 7.4 狄仁杰,推理,悬疑,小说,安娜芳芳,中国,同人作品,神探狄仁杰
1784978 男人的天方夜谭 https://img3.doubanio.com/view/subject/l/public/s1656293.jpg 6.4 李亚平,历史,杂文,读书笔记,小说,男人的天方夜谭,散文,随笔
1131678 招财狐狸 https://img3.doubanio.com/view/subject/l/public/s10431130.jpg 6.4 股票,花荣,投资,金融,操盘,投资理财,金融理财什么的,财经
1510382 济公全传(上下) https://img3.doubanio.com/view/subject/l/public/s1508026.jpg 7.0 古典文学,济公全传,震后被盗,文学,小说,国人作品,四颗星,古典
..................
3241893 藏地密码4 https://img3.doubanio.com/view/subject/l/public/s3303626.jpg 7.8 小说,西藏,藏地密码,探险,宗教,何马,中国,奇幻
2030120 史学导论 https://img3.doubanio.com/view/subject/l/public/s2339042.jpg 8.9 历史,史学理论,历史研究入门读物,史学,史学导论,历史学,约翰·托什,思想