

趣笔阁 Scraper Experiment

Use BeautifulSoup to parse the page structure, crawl the pages of a given novel, and save each chapter's content to a txt file.

Points that could still be improved (to be updated):

1. Anti-scraping countermeasures (see the sketch after this list)

2. Multithreading (covered in the same sketch below)

3. Saving as Markdown for nicer output (a sketch follows the full script)
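Neither improvement 1 nor 2 is implemented yet. Below is a minimal sketch of what they could look like, assuming the same site layout as the script that follows; the UA_POOL list and the fetch_chapter helper are hypothetical names invented for illustration, not part of the original code.

import random
import time
from concurrent.futures import ThreadPoolExecutor

import requests

# Hypothetical User-Agent pool to rotate through (improvement 1: make requests look less uniform)
UA_POOL = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36",
]

def fetch_chapter(url):
    # Rotate the User-Agent and add a random delay between requests
    header = {'user-agent': random.choice(UA_POOL)}
    time.sleep(random.uniform(0.5, 2.0))
    resp = requests.get(url, headers=header, timeout=10)
    resp.encoding = resp.apparent_encoding
    return resp.text

# Improvement 2: fetch several chapters concurrently with a small thread pool.
# max_workers stays low on purpose so the site is not hammered.
urls = [f"https://www.3bqg.cc/book/152484/{i}.html" for i in range(1, 6)]
with ThreadPoolExecutor(max_workers=3) as pool:
    pages = list(pool.map(fetch_chapter, urls))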

import os
import re
import requests
from bs4 import BeautifulSoup
import time
import markdownify  # used for the experimental markdown output in getConcent
def getHtml(url, param=None, encoding=None):
    # Fetch the HTML text at the given url
    try:
        # Build the request headers
        header = {'user-agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36"}
        # Send the request
        html = requests.get(url, headers=header, timeout=10)
        # Pick an encoding: fall back to the detected one
        if encoding is None:
            encoding = html.apparent_encoding
        html.encoding = encoding
        # Save the HTML text
        content = html.text
        # Close the connection
        html.close()
        # Sleep 1s so we do not hit the site too fast
        time.sleep(1)
        # Return the page content
        return content
    except requests.RequestException as e:
        print(f"Request failed: {e}")
        return None  # None signals failure


def getLink(content):
    soup = BeautifulSoup(content, "html.parser")
    res = []
    # Find all chapter entries (<dd> tags)
    titles = soup.find_all('dd')
    # Extract the text of each chapter link
    for title in titles:
        a_tag = title.find('a')
        if a_tag:
            name = a_tag.text
            res.append(name)
    print(res)
    return res

def save(name, passage_content, path):
    # Build the file path; the chapter name is the file name
    file_path = os.path.join(path, f"{name}.txt")
    # Replace runs of whitespace with newlines and write the text out
    with open(file_path, 'w', encoding='utf-8') as file:
        for content in passage_content:
            # Collapse consecutive whitespace into a newline
            formatted_content = re.sub(r'\s{2,}', '\n', content.text)
            file.write(formatted_content + "\n")  # trailing newline keeps paragraphs apart
    print(f"Chapter {name} saved")


def saveImg(title, imgLink, path):
    # Build the file path; the book title is the file name
    file_path = os.path.join(path, f"{title}.jpg")

    # Download the cover image and write the raw bytes to disk
    response = requests.get(imgLink)
    if response.status_code == 200:
        with open(file_path, 'wb') as file:
            file.write(response.content)
        print(f"Image {title} saved")
    else:
        print(f"Failed to save image {title}")


def getMain(content):
    soup = BeautifulSoup(content, "html.parser")
    name = ''
    # The div.info block holds the cover image and the book title
    titles = soup.find_all('div', attrs={"class": "info"})
    for title in titles:
        img_tag = title.find('img')
    img = img_tag['src']    # cover image URL
    name = img_tag['alt']   # book title from the alt attribute
    return name, img

def getConcent(root, titles, path):
    # Chapter titles look like "第N章 ..."; N is also the page name in the URL
    pat = re.compile(r'第(.*?)章')
    for title in titles:
        res = pat.search(title)
        if res:
            print(res.groups())
            page = res.group(1)
            url = root + page + ".html"
            content = getHtml(url)
            if content is None:
                continue  # skip chapters whose request failed
            soup = BeautifulSoup(content, "html.parser")
            passage = []
            # The chapter body lives in <div id="chaptercontent">
            passage_content = soup.find_all("div", attrs={"id": "chaptercontent"})
            for item in passage_content:
                passage.append(item.text)
                print(item.text)
                print(markdownify.markdownify(item.text))  # markdown experiment (improvement 3)
                # print(passage_content)
            save(title, passage_content, path)


if __name__ == "__main__":
    try:
        # Index page of the target novel
        url = "https://www.3bqg.cc/book/152484/"
        # Output directory for the txt files
        path = "./novel/"
        # Fetch the index page source
        content = getHtml(url)

        title, img = getMain(content)

        # Make sure the per-book output directory exists
        os.makedirs(path + title, exist_ok=True)

        saveImg(title, img, path + title)

        titles = getLink(content)

        getConcent(url, titles, path + title)
    except Exception as e:
        print(f"Program error: {e}")
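Improvement 3 is also still open. Here is a minimal sketch of a Markdown variant of save(), assuming the same passage_content tag list that save() receives; saveMarkdown is a hypothetical name invented here, not part of the original script.

import os
import re

def saveMarkdown(name, passage_content, path):
    # Same input as save(), but writes a .md file with the chapter name as a heading
    file_path = os.path.join(path, f"{name}.md")
    with open(file_path, 'w', encoding='utf-8') as file:
        file.write(f"# {name}\n\n")
        for content in passage_content:
            # Collapse runs of whitespace into blank-line paragraph breaks
            formatted = re.sub(r'\s{2,}', '\n\n', content.text)
            file.write(formatted + "\n")
    print(f"Chapter {name} saved as markdown")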
Tags: web scraping, python

Reposted from: https://blog.csdn.net/XUE_DING_E/article/details/142534502
Copyright belongs to the original author, XUE_DING_E. If there is any infringement, please contact us for removal.
