

Using Selenium to Get Past QQ Novels' Dynamic Cookies and Scrape and Clean a Large Chinese Text Corpus (Linux Environment)

A quick note first: although I wrote this on Linux, it works the same way on Windows; only the versions of selenium and a few other packages differ.

Environment setup:

Ubuntu 20.04

A basic Python 3.8 environment created with Anaconda3:

conda create -n <env_name> python=3.8

matplotlib and requests: just install them with pip inside the isolated environment.

The only thing that needs attention with matplotlib on Linux is the font configuration; this post won't go into detail, here is a reference:

搜集总结的一些matplotlib在linux服务器上的字体安装方案_centos matplotlib 字体库安装-CSDN博客
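For reference, this is the shape of the fix that worked for me: register a Chinese TTF with matplotlib and point the sans-serif family at it. A minimal sketch, assuming a SimHei file lives at /usr/share/fonts/truetype/simhei.ttf (the path is just an example, adjust it to wherever your font actually is):

# minimal sketch: register a Chinese font with matplotlib on a Linux server
# the font path below is an example, not a fixed requirement
import matplotlib.pyplot as plt
from matplotlib import font_manager

font_manager.fontManager.addfont("/usr/share/fonts/truetype/simhei.ttf")  # register the TTF
plt.rcParams['font.sans-serif'] = ['SimHei']   # use it as the default sans-serif family
plt.rcParams['axes.unicode_minus'] = False     # keep minus signs from rendering as boxes
plt.title('中文标题测试')                        # Chinese text should now render correctly
plt.show()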

selenium: two things to note when installing: 1. check your own browser's version and download the matching driver; 2. after extracting the driver, put it somewhere on the system PATH. A reference:

[How TO]-ubuntu下安装selenium_ubuntu安装selenium-CSDN博客
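Once the driver is in place, a quick way to confirm everything is wired up is to check that geckodriver is actually visible on the PATH before fighting with Selenium errors; a small sketch:

# sanity check: is geckodriver on the PATH, and which version is it?
import shutil
import subprocess

driver_path = shutil.which("geckodriver")
print("geckodriver found at:", driver_path)   # e.g. /usr/local/bin/geckodriver, or None if not on the PATH
if driver_path:
    print(subprocess.run([driver_path, "--version"], capture_output=True, text=True).stdout)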

A few notes on dynamic cookies:

Dynamic cookies are a fairly common anti-scraping mechanism: the user's cookies change on every single visit, and QQ Novels is a textbook example. There are plenty of posts on CSDN summarizing ways to get past dynamic cookies, for example reverse-engineering the JavaScript that generates them, but with a tool as convenient as Selenium available, why go through the pain of JS reverse engineering? I have also seen some bloggers use Selenium only to grab the current cookie and then fire off requests with the requests library, which is a completely broken approach: how exactly do you expect to keep the session consistent that way? I have read through those bloggers' code, and not one of them actually manages to scrape anything.
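To make the consistency point concrete: within one Selenium session the cookies keep evolving as pages load, so a snapshot copied out and handed to requests goes stale almost immediately. A rough sketch of how to observe this (the chapter URLs are just examples):

# rough sketch: watch the cookies change between two page loads in a single driver session
from selenium import webdriver
from selenium.webdriver.firefox.options import Options

opts = Options()
opts.add_argument("--headless")
driver = webdriver.Firefox(options=opts)

driver.get("https://book.qq.com/book-read/50780743/1")
first = {c['name']: c['value'] for c in driver.get_cookies()}
driver.get("https://book.qq.com/book-read/50780743/2")
second = {c['name']: c['value'] for c in driver.get_cookies()}

# any cookie whose value differs between the two loads is one a frozen requests session would get wrong
changed = [name for name in first if name in second and first[name] != second[name]]
print("cookies that changed between loads:", changed)
driver.quit()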

Now for my own approach, using QQ Novels' free novels as the example (don't ask why I'm not scraping the paid ones). One caveat: this was for a corpus-collection assignment in a course; I spent half a day writing it and a full day debugging, and stopped after scraping roughly 8 million characters. If you want to actually use it, adapt it yourself rather than copying it verbatim:

This is the main program:

#coding:utf-8

import os
import requests
import re
import collections
import math
import matplotlib.pyplot as plt
from get_cookiejar import parse_cookie_string,cookies_dict_to_cookiejar
from webdriver import get_dynamic_url
import time
import random

INCRESE_WORDS=1024*1024 # number of Chinese characters corresponding to each additional ~2 MB of corpus
total_count=0
# base URL of the free-novel listing page
base_url="https://book.qq.com/book-cate/0-0-0-1-0-0-0-1"
user_header={"User-Agent":"Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:130.0) Gecko/20100101 Firefox/130.0"}
raw_cookie="RK=g59N6ZxTHA; ptcz=1fdcda0f61113531c6bd66dbc41a908fde778843866781c8323e4536dffe0628; logTrackKey=f210855df36a4bd7aa4ccaec2879b8c8; secToken=5fc356d2705353c4ff9ea5d3ab48050b; Hm_lvt_ab19911ff0c788613a31e85bb39089e2=1727182002; Hm_lpvt_ab19911ff0c788613a31e85bb39089e2=1727183248; HMACCOUNT=CC4A40E5BE8F6900; fuid=195c36c101864bbe8e8f3235fcc85264; accesstoken=84_VuG81L0Mtw86bkPn_0BklRsi3kGR9YBu28ircsZKYEmNvP-E4r8NDIqe4fTykvfXTjqQw1bHW5VEpUVJj3UmhDARr4w22mVOmFc14nFYzxM; loginType=2; openid=oriMkuGITCJoONr4lgJKy0VD9ugM; originloginType=19; ywguid=120475560916; ywkey=ywf3Wd5CjX8R; uid=120475560916; usid=ywf3Wd5CjX8R"
raw_cookiedict=parse_cookie_string(raw_cookie)
user_cookie=cookies_dict_to_cookiejar(raw_cookiedict)
# beware of catastrophic backtracking: don't use . to match whitespace here
href_pattern=re.compile(r"""<div\smulan-bid=(.*?)><a\stitle=(?P<novel_title>.*?)\shref=("(?P<refer_url>.*?)")(.*?)><(.*)>(.*?)\s+(.*?)\s+(.*?)<a(.*?)title=.*?第(?P<total_page>[0-9]*)""")

# define a connection exception
class ConnectError(Exception):
    def __init__(self,ErrorUrl):
        super().__init__(self)  # initialize the parent class
        self.errorinfo="failed to connect to {}".format(ErrorUrl)
    def __str__(self):
        return self.errorinfo

class Novel():
    def __init__(self,title,url,chapter):
        self.novel_name=title
        self.goto_url="https:"+url
        self.total_chapter=chapter

# fetch the target page and write it to a local file
def get_info(filename,goal_url,user_cookies,user_headers,waittime=5):
    try:
        response=requests.get(goal_url,headers=user_headers,cookies=user_cookies,timeout=waittime)
        if not response.status_code==200:
            raise ConnectError(goal_url)
        with open(str(filename),mode='w',encoding='utf-8') as f:
            f.write(response.content.decode('utf-8'))
        response.close()
        print(f"{goal_url} fetched successfully, written to {filename}...")
    except ConnectError as e:
        print(e)
        exit()
    return filename

# extract information from a saved page (helper, not used in the final run)
def html_info_process(filename,re_herf=re.compile(r".*",re.S),re_text=re.compile(r".*",re.S)):
    try:
        with open (str(filename),mode='r',encoding='utf-8') as f:
            html_content=f.read()
        return html_content
    except IOError:
        print("{} does not exist".format(filename))

def count_and_calculate_entropy(text):
    # count the frequency of each character
    frequency = collections.Counter(text)
    # total number of characters
    total_count = sum(frequency.values())
    # compute the information entropy
    entropy = 0.0
    for count in frequency.values():
        probability = count / total_count
        entropy -= probability * math.log(probability, 2)  # base-2 logarithm
    return frequency, entropy

def plot_frequency(frequency,entropy):
    sorted_frequency = dict(sorted(frequency.items(), key=lambda item: item[1], reverse=True))
    # extract the keys and values from the Counter
    characters = list(sorted_frequency.keys())
    counts = list(sorted_frequency.values())
    # visualize the character frequencies
    plt.rcParams['font.family'] = 'sans-serif'
    plt.rcParams['font.sans-serif'] = ['SimHei']
    plt.rcParams['axes.unicode_minus'] = False
    plt.figure(figsize=(10, 6))
    plt.bar(characters[0:10], counts[0:10], color='skyblue')
    plt.xlabel('汉字')
    plt.ylabel('频率')
    plt.title('汉字频率统计')
    plt.xticks(rotation=45)
    plt.show()

def main():
    global total_count,user_header,user_cookie
    incresed_word=0
    file=get_info(filename="total_info.html",goal_url=base_url,user_headers=user_header,user_cookies=user_cookie)
    with open(file,mode='r',encoding='utf-8') as f:
        raw_html=f.read()
        novel_list=[]
        html_iter=re.finditer(href_pattern,raw_html)
        Content_re=re.compile(r"""<p>(?P<content>.*)</p></div>""",re.S)
        Incresed_word_re=re.compile(r"""</label>\s?<.*?class="ypc-link".*?<label.*?>本章字数:</label>\s?<span.*?>(?P<chapter_words>.*?)字</span>""")
        for iter in html_iter:
            Reference=iter.group("refer_url").split('/')
            # print(Reference)
            Url="//book.qq.com/book-read/"+Reference[-1]
            temp=Novel(title=iter.group("novel_title"),url=Url,chapter=iter.group("total_page"))
            novel_list.append(temp)
            print("小说名称:{}".format(iter.group("novel_title")))
            print("跳转网址:{}".format(iter.group("refer_url")))
            print("最多章数(params递增上界):{}".format(iter.group("total_page")))
        n=len(novel_list)
        for i in range(0,n):
            File_name=novel_list[i].novel_name
            book_url=novel_list[i].goto_url
            total_chapter=novel_list[i].total_chapter
            j=1
            while j <= int(total_chapter)+1:
                chapter_url=book_url+f'/{j}'
                print(f"当前正在采集{File_name}的第{j}章")
                html_string=get_dynamic_url(chapter_url)
                # print(type(html_string))
                # with open("try.html",mode='w',encoding='utf-8') as f:
                #     f.write(html_string)
                #     f.close()
                temp_content=re.search(Content_re,html_string)
                incresed_words_match=re.search(Incresed_word_re,html_string)
                incresed_words=int(incresed_words_match.group("chapter_words"))
                print(f"当前章节字数为:{incresed_words}")
                processed_html=temp_content.group("content")
                processed_html=processed_html.strip('-')
                cleaned_text = re.sub(r'</p><p>', ' ', processed_html)
                cleaned_text = re.sub(r'\s+', ' ', cleaned_text).strip()
                cleaned_text = re.sub(r'[A-Za-z0-9\W\s\d]*', '', cleaned_text)
                random.seed(time.time())  # reseed before picking a random delay
                time.sleep(random.randint(1,10))
                with open ("content.html",mode='a',encoding='utf-8') as f:
                    f.write(cleaned_text)
                    f.close()
                incresed_word = incresed_word + incresed_words
                print(f"已累计总字数为:{incresed_word}")
                if incresed_word>=INCRESE_WORDS:
                    print("已提取到2M的语料")
                    total_count+1
                    capcity=2*total_count
                    with open("content.html",mode='r',encoding='utf-8') as f:
                        calcing_html=f.read()
                        Frequency,Entroy= count_and_calculate_entropy(calcing_html)
                        print(f"在{capcity}M下统计得到的信息熵为{Entroy}")
                        plot_frequency(Frequency,Entroy)
                        f.close()
                    if total_count>3:
                        print("收集结束,按任意键退出")
                        os.system("pause")
                        exit()
                    incresed_word=0
                j=j+1
            print(f"{File_name}抓取完毕")

if __name__=="__main__":
    main()
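
The cleaning chain inside main() is the part most worth lifting out, so here it is as a standalone sketch applied to a fabricated chapter fragment (the HTML below is made up for illustration, not real QQ Novels markup):

# standalone sketch of the cleaning steps from main(), applied to a fabricated fragment
import re

sample = "<p>第一段文字,包含abc123和标点。</p><p>第二段文字。</p></div>"

content = re.search(r"<p>(?P<content>.*)</p></div>", sample, re.S).group("content")
cleaned = re.sub(r'</p><p>', ' ', content)            # paragraph breaks -> spaces
cleaned = re.sub(r'\s+', ' ', cleaned).strip()        # collapse whitespace
cleaned = re.sub(r'[A-Za-z0-9\W\s\d]*', '', cleaned)  # strip letters, digits, punctuation and spaces
print(cleaned)   # -> 第一段文字包含和标点第二段文字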
 

This is the program that packs the cookies into a CookieJar:

import http.cookiejar
import requests
from http.cookiejar import Cookie

# parse a raw cookie string into a dict
def parse_cookie_string(cookie_string):
    cookies = {}
    for cookie in cookie_string.split(';'):
        name, value = cookie.strip().split('=', 1)
        cookies[name] = value
    return cookies

# convert a dict of cookies into a CookieJar object
def cookies_dict_to_cookiejar(cookie_dict):
    cookie_jar = http.cookiejar.CookieJar()
    for name, value in cookie_dict.items():
        cookie = Cookie(
            version=0, name=name, value=value,
            port=None, port_specified=False,
            domain="", domain_specified=False, domain_initial_dot=False,
            path="/", path_specified=True,
            secure=False, expires=None, discard=True,
            comment=None, comment_url=None, rest={}, rfc2109=False
        )
        cookie_jar.set_cookie(cookie)
    return cookie_jar
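
Side note: if you don't care about per-cookie attributes such as domain and path, requests already bundles a helper that does essentially the same conversion; a one-line sketch using a couple of the cookie names from above:

# equivalent shortcut using the helper shipped with requests
import requests

cookie_jar = requests.utils.cookiejar_from_dict({"RK": "g59N6ZxTHA", "uid": "120475560916"})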

And finally, this is the program that uses Selenium to simulate the browser:

from selenium import webdriver
from selenium.webdriver.firefox.service import Service
from selenium.webdriver.firefox.options import Options
import time
import random

# 1. configure the Firefox browser options
firefox_options = Options()
firefox_options.add_argument("--headless")  # headless mode: run in the background without opening a browser window
firefox_options.add_argument("--disable-gpu")  # disable GPU acceleration (optional)
gecko_service = Service("/usr/local/bin/geckodriver")

def get_dynamic_url(goal_url,status=True):
    driver = webdriver.Firefox(service=gecko_service, options=firefox_options)
    try:
        driver.get(goal_url)
    except Exception:
        print("Network connection failed, recording the current position and restarting the fetch")
        driver.quit()
        time.sleep(random.randint(1,10))
        html_string=get_dynamic_url(goal_url=goal_url)
        return html_string
    else:
        random.seed(time.time())
        time.sleep(random.randint(1,3))
        html_string=driver.page_source
        driver.quit()
        return html_string

if __name__ == "__main__":
    html_string=get_dynamic_url("https://book.qq.com/book-read/50780743/1")
    print(html_string)
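
If you adapt this, one obvious improvement is to stop launching a fresh headless Firefox for every single chapter: keep one driver alive for the whole run and replace the fixed sleep with an explicit wait for the chapter body. A rough sketch (waiting on a <p> tag is an assumption about the page structure, not something verified against QQ Novels' current markup):

# rough sketch: reuse one driver across chapters and wait explicitly for content
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

opts = Options()
opts.add_argument("--headless")
driver = webdriver.Firefox(options=opts)

def fetch_chapter(url, timeout=10):
    driver.get(url)
    # block until at least one <p> element is present instead of sleeping a fixed interval
    WebDriverWait(driver, timeout).until(EC.presence_of_element_located((By.TAG_NAME, "p")))
    return driver.page_source

try:
    for j in range(1, 4):
        html = fetch_chapter(f"https://book.qq.com/book-read/50780743/{j}")
        print(len(html))
finally:
    driver.quit()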

Finally, the results. Since each checkpoint has to accumulate well over a million characters, the whole program takes quite a long time to run, but there is no need to worry about network problems along the way; the relevant exceptions were all handled during debugging.

[two screenshots of the run results omitted]

Tags: linux, selenium, ops

Reposted from: https://blog.csdn.net/m0_55324065/article/details/142612292
Copyright belongs to the original author, kuki&&yuki. If there is any infringement, please contact us and we will remove it.
