0


豆瓣书摘 | 爬虫 | Python

获取豆瓣书摘，存入 MongoDB 中。

import logging
import time

import requests
from bs4 import BeautifulSoup
from pymongo import MongoClient

# Browser-like request headers sent with every page fetch.
headers = {
    'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
    'accept-language': 'zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6',
    'cache-control': 'max-age=0',
    'priority': 'u=0, i',
    'sec-ch-ua': '"Chromium";v="130", "Microsoft Edge";v="130", "Not?A_Brand";v="99"',
    'sec-ch-ua-mobile': '?0',
    'sec-ch-ua-platform': '"Windows"',
    'sec-fetch-dest': 'document',
    'sec-fetch-mode': 'navigate',
    'sec-fetch-site': 'none',
    'sec-fetch-user': '?1',
    'upgrade-insecure-requests': '1',
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/130.0.0.0 Safari/537.36 Edg/130.0.0.0',
}

# Base query parameters; 'start' is overridden for each requested page.
params = {
    'sort': 'score',
    'start': 0,
}

BLOCKQUOTES_URL = 'https://book.douban.com/subject/1009393/blockquotes'
PAGE_SIZE = 20          # step between consecutive 'start' offsets
TOTAL_ITEMS = 1260      # exclusive upper bound for the 'start' offset
REQUEST_TIMEOUT = 10    # seconds; prevents a stalled connection from hanging the crawl
PAGE_DELAY = 3          # seconds to sleep between page requests

# Errors raised when an expected tag or attribute is absent from a figure:
# ``None.text`` -> AttributeError, ``None['src']`` -> TypeError,
# ``tag['missing']`` -> KeyError.
_MISSING_ERRORS = (AttributeError, KeyError, TypeError)


def _safe_extract(figure, getter):
    """Return ``getter(figure)``, or ``None`` (after logging the figure) if a tag or attribute is missing."""
    try:
        return getter(figure)
    except _MISSING_ERRORS:
        logging.error(figure)
        return None


def _extract_content(figure):
    """Return the quote text of *figure* after removing its extra-info div and first link.

    This mutates *figure* (the two elements are decomposed), so it must run
    after every other field has been read.
    """
    extra = figure.find('div', class_='blockquote-extra')
    link = figure.find('a')
    extra.decompose()
    link.decompose()
    # Remove the leftover "()" first and strip afterwards, so whitespace that
    # surrounded the removed "()" does not remain in the stored text.
    return figure.text.replace('()', '').strip()


def parse_figure(figure):
    """Extract one blockquote record from a ``<figure>`` element.

    Args:
        figure: a BeautifulSoup tag (or any object offering ``find`` and
            ``text``) for a single blockquote entry.

    Returns:
        A dict with the keys ``author_avatar``, ``author_name``, ``likes``,
        ``datetime``, ``page_reference`` and ``content``. A field whose tag
        or attribute is missing is set to ``None`` and the figure is logged
        at error level.
    """
    data = {
        'author_avatar': _safe_extract(figure, lambda f: f.find('img')['src']),
        'author_name': _safe_extract(
            figure, lambda f: f.find('a', class_='author-name').text.strip()
        ),
        'likes': _safe_extract(
            figure, lambda f: f.find('span').text.strip().replace('赞', '')
        ),
        'datetime': _safe_extract(figure, lambda f: f.find('datetime').text.strip()),
        'page_reference': _safe_extract(figure, lambda f: f.find('figcaption')['title']),
    }
    # Content extraction destroys parts of the figure, so it goes last.
    data['content'] = _safe_extract(figure, _extract_content)
    return data


def main():
    """Fetch every blockquote page and store each parsed record in MongoDB.

    Exits with status 1 if a page does not contain the ``blockquote-list``
    container. Insert failures are logged and the crawl continues.
    """
    # Connect to the MongoDB server (assumed to run locally on the default
    # port 27017). The database and collection are created by MongoDB on the
    # first insert if they do not exist yet.
    with MongoClient('localhost', 27017) as client:
        collection = client['douban_database']['blockquotes_1009393']

        for start in range(0, TOTAL_ITEMS, PAGE_SIZE):
            response = requests.get(
                BLOCKQUOTES_URL,
                params={**params, 'start': start},
                headers=headers,
                timeout=REQUEST_TIMEOUT,
            )
            soup = BeautifulSoup(response.text, 'lxml')

            blockquote_list = soup.find("div", attrs={"class": "blockquote-list"})
            if blockquote_list is None:
                logging.error("blockquote-list is not exist")
                raise SystemExit(1)

            for figure in blockquote_list.find_all("figure"):
                data = parse_figure(figure)
                try:
                    collection.insert_one(data)
                except Exception:
                    # Best effort: one failed insert must not abort the crawl.
                    logging.exception("failed to insert blockquote record")

            # Be polite to the server between page requests.
            time.sleep(PAGE_DELAY)


if __name__ == "__main__":
    main()

效果图:
存入数据库效果图


本文转载自: https://blog.csdn.net/qq_41359651/article/details/143928267
版权归原作者 蜉蝣Sakura 所有, 如有侵权,请联系我们删除。

“豆瓣书摘 | 爬虫 | Python”的评论:

还没有评论