A quick note up front: although I wrote this on Linux, it works the same way on Windows; the method is identical, only the versions of selenium and a few other packages differ.
Environment:
Ubuntu 20.04
A basic Python 3.8 environment created with anaconda3:
conda create -n <env_name> python=3.8
matplotlib: just pip install it inside the isolated environment.
The only real pitfall is font configuration on Linux, which I won't detail here; for reference see the CSDN post 搜集总结的一些matplotlib在linux服务器上的字体安装方案.
requests: just pip install it.
selenium: two things to watch when installing: 1. check your browser version and download the matching driver; 2. after extracting the driver, put it on your system PATH. For reference see the CSDN post [How TO]-ubuntu下安装selenium. A quick sanity check for both pitfalls is sketched right below.
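Here is a minimal sanity check of my own for the two fiddly parts above (SimHei as the CJK font and /usr/local/bin/geckodriver as the driver path are assumptions; substitute whatever you installed):
#sanity check: can matplotlib see a CJK font, and is geckodriver on PATH?
import shutil
from matplotlib import font_manager
print(shutil.which("geckodriver")) #should print a path such as /usr/local/bin/geckodriver; None means it is not on PATH
print([f.name for f in font_manager.fontManager.ttflist if "Hei" in f.name]) #an empty list means no SimHei-like font is installed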
A few notes on dynamic cookies:
Dynamic cookies are a common anti-scraping measure: the user's cookies change on every single visit, and QQ小说 is a typical example. There are plenty of posts on CSDN summarising ways to break dynamic cookies, for instance by reverse-engineering the JavaScript that generates them, but with something as convenient as selenium at hand, why suffer through JS reversing? I have also seen bloggers use selenium merely to grab the current cookies and then hand them to requests. That approach is flat-out wrong: the cookies are regenerated on each visit, so how is the requests session supposed to stay consistent with the browser session that produced them? I have read the code from several of those posts, and not one of them actually scrapes anything.
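If you want to see the rotation for yourself, here is a minimal sketch (my own, assuming a headless Firefox with geckodriver on PATH, as configured later in this post) that visits the site twice as a fresh visitor and compares the cookies:
#minimal sketch: show that book.qq.com re-issues cookies on every fresh visit
from selenium import webdriver
from selenium.webdriver.firefox.options import Options
opts = Options()
opts.add_argument("--headless")
driver = webdriver.Firefox(options=opts)
driver.get("https://book.qq.com")
first = {c["name"]: c["value"] for c in driver.get_cookies()}
driver.delete_all_cookies() #simulate a brand-new visitor
driver.get("https://book.qq.com")
second = {c["name"]: c["value"] for c in driver.get_cookies()}
driver.quit()
#any name printed here was re-issued with a different value between the two visits
print({k for k in first if k in second and first[k] != second[k]})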
Now for my approach, using QQ小说's free novels as the example (don't ask why I didn't scrape the paid ones). One caveat: this was written for a course assignment on corpus collection; I spent half a day writing it and a day debugging, and stopped once I had scraped roughly eight million characters. If you want to use it directly, adapt it yourself rather than copying it verbatim:
This is the main program:
#coding:utf-8
import os
import requests
import re
import collections
import math
import matplotlib.pyplot as plt
from get_cookiejar import parse_cookie_string,cookies_dict_to_cookiejar
from webdriver import get_dynamic_url
import time
import random
INCRESE_WORDS=1024*1024 #number of Chinese characters that adds 2MB of corpus
total_count=0
#base URL of the free-novel catalogue
base_url="https://book.qq.com/book-cate/0-0-0-1-0-0-0-1"
user_header={"User-Agent":"Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:130.0) Gecko/20100101 Firefox/130.0"}
raw_cookie="RK=g59N6ZxTHA; ptcz=1fdcda0f61113531c6bd66dbc41a908fde778843866781c8323e4536dffe0628; logTrackKey=f210855df36a4bd7aa4ccaec2879b8c8; secToken=5fc356d2705353c4ff9ea5d3ab48050b; Hm_lvt_ab19911ff0c788613a31e85bb39089e2=1727182002; Hm_lpvt_ab19911ff0c788613a31e85bb39089e2=1727183248; HMACCOUNT=CC4A40E5BE8F6900; fuid=195c36c101864bbe8e8f3235fcc85264; accesstoken=84_VuG81L0Mtw86bkPn_0BklRsi3kGR9YBu28ircsZKYEmNvP-E4r8NDIqe4fTykvfXTjqQw1bHW5VEpUVJj3UmhDARr4w22mVOmFc14nFYzxM; loginType=2; openid=oriMkuGITCJoONr4lgJKy0VD9ugM; originloginType=19; ywguid=120475560916; ywkey=ywf3Wd5CjX8R; uid=120475560916; usid=ywf3Wd5CjX8R"
raw_cookiedict=parse_cookie_string(raw_cookie)
user_cookie=cookies_dict_to_cookiejar(raw_cookiedict)
#beware catastrophic backtracking: do not use . to match whitespace here
href_pattern=re.compile(r"""<div\smulan-bid=(.*?)><a\stitle=(?P<novel_title>.*?)\shref=("(?P<refer_url>.*?)")(.*?)><(.*)>(.*?)\s+(.*?)\s+(.*?)<a(.*?)title=.*?第(?P<total_page>[0-9]*)""")
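#illustrative only, not a real catalogue entry: a fragment like
#  <div mulan-bid=1><a title=某小说 href="//book.qq.com/book-detail/50780743" ...>...第100章...
#would yield novel_title=某小说, refer_url=//book.qq.com/book-detail/50780743, total_page=100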
#custom exception for failed connections
class ConnectError(Exception):
    def __init__(self,ErrorUrl):
        super().__init__(ErrorUrl) #initialise the base Exception
        self.errorinfo="failed to connect to {}".format(ErrorUrl)
    def __str__(self):
        return self.errorinfo
class Novel():
    def __init__(self,title,url,chapter):
        self.novel_name=title
        self.goto_url="https:"+url
        self.total_chapter=chapter
#fetch a page and write it to a local file
def get_info(filename,goal_url,user_cookies,user_headers,waittime=5):
    try:
        #note: fetch goal_url itself, not the global base_url
        response=requests.get(goal_url,headers=user_headers,cookies=user_cookies,timeout=waittime)
        if not response.status_code==200:
            raise ConnectError(goal_url)
        with open(str(filename),mode='w',encoding='utf-8') as f:
            f.write(response.content.decode('utf-8'))
        response.close()
        print(f"{goal_url} fetched, {filename} written...")
    except ConnectError as e:
        print(e)
        exit()
    return filename
#read a saved page back from disk (the regex parameters are reserved hooks, unused for now)
def html_info_process(filename,re_herf=re.compile(r".*",re.S),re_text=re.compile(r".*",re.S)):
    try:
        with open(str(filename),mode='r',encoding='utf-8') as f:
            html_content=f.read()
        return html_content
    except IOError:
        print("{} does not exist".format(filename))
def count_and_calculate_entropy(text):
    # count the frequency of each character
    frequency = collections.Counter(text)
    # total number of characters
    total_count = sum(frequency.values())
    # Shannon entropy
    entropy = 0.0
    for count in frequency.values():
        probability = count / total_count
        entropy -= probability * math.log(probability, 2) # log base 2, so the result is in bits
    return frequency, entropy
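#worked example (hypothetical input): count_and_calculate_entropy("天天向上")
#returns frequency=Counter({'天': 2, '向': 1, '上': 1}) and
#entropy=-(0.5*log2(0.5)+0.25*log2(0.25)+0.25*log2(0.25))=1.5 bits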
def plot_frequency(frequency,entropy):
    sorted_frequency = dict(sorted(frequency.items(), key=lambda item: item[1], reverse=True))
    # unpack the keys and values of the Counter
    characters = list(sorted_frequency.keys())
    counts = list(sorted_frequency.values())
    #visualise the character frequencies (SimHei is what makes the Chinese labels render)
    plt.rcParams['font.family'] = 'sans-serif'
    plt.rcParams['font.sans-serif'] = ['SimHei']
    plt.rcParams['axes.unicode_minus'] = False
    plt.figure(figsize=(10, 6))
    plt.bar(characters[0:10], counts[0:10], color='skyblue')
    plt.xlabel('汉字')
    plt.ylabel('频率')
    plt.title('汉字频率统计')
    plt.xticks(rotation=45)
    plt.show()
def main():
    global total_count,user_header,user_cookie
    incresed_word=0
    file=get_info(filename="total_info.html",goal_url=base_url,user_headers=user_header,user_cookies=user_cookie)
    with open(file,mode='r') as f:
        raw_html=f.read()
    novel_list=[]
    html_iter=re.finditer(href_pattern,raw_html)
    Content_re=re.compile(r"""<p>(?P<content>.*)</p></div>""",re.S)
    Incresed_word_re=re.compile(r"""</label>\s?<.*?class="ypc-link".*?<label.*?>本章字数:</label>\s?<span.*?>(?P<chapter_words>.*?)字</span>""")
    for iter in html_iter:
        Reference=iter.group("refer_url").split('/')
        Url="//book.qq.com/book-read/"+Reference[-1]
        temp=Novel(title=iter.group("novel_title"),url=Url,chapter=iter.group("total_page"))
        novel_list.append(temp)
        print("novel title: {}".format(iter.group("novel_title")))
        print("detail URL: {}".format(iter.group("refer_url")))
        print("chapter count (upper bound of the page param): {}".format(iter.group("total_page")))
    n=len(novel_list)
    for i in range(0,n):
        File_name=novel_list[i].novel_name
        book_url=novel_list[i].goto_url
        total_chapter=novel_list[i].total_chapter
        j=1
        while j <= int(total_chapter): #the chapter param runs from 1 to total_chapter
            chapter_url=book_url+f'/{j}'
            print(f"fetching chapter {j} of {File_name}")
            html_string=get_dynamic_url(chapter_url)
            temp_content=re.search(Content_re,html_string)
            incresed_words_match=re.search(Incresed_word_re,html_string)
            incresed_words=int(incresed_words_match.group("chapter_words"))
            print(f"this chapter has {incresed_words} characters")
            processed_html=temp_content.group("content")
            processed_html=processed_html.strip('-')
            cleaned_text = re.sub(r'</p><p>', ' ', processed_html)
            cleaned_text = re.sub(r'\s+', ' ', cleaned_text).strip()
            cleaned_text = re.sub(r'[A-Za-z0-9\W\s\d]*', '', cleaned_text) #strip everything except CJK characters
            random.seed(time.time())
            time.sleep(random.randint(1,10)) #random delay between chapter requests
            with open("content.html",mode='a',encoding='utf-8') as f:
                f.write(cleaned_text)
            incresed_word = incresed_word + incresed_words
            print(f"cumulative character count: {incresed_word}")
            if incresed_word>=INCRESE_WORDS:
                print("another 2MB of corpus collected")
                total_count+=1
                capacity=2*total_count
                with open("content.html",mode='r',encoding='utf-8') as f:
                    calcing_html=f.read()
                Frequency,Entropy=count_and_calculate_entropy(calcing_html)
                print(f"entropy at {capacity}MB: {Entropy}")
                plot_frequency(Frequency,Entropy)
                if total_count>3:
                    print("collection finished, press Enter to exit")
                    input()
                    exit()
                incresed_word=0
            j=j+1
        print(f"finished fetching {File_name}")
if __name__=="__main__":
    main()
This is the helper that packs the cookie string into a CookieJar:
import http.cookiejar
from http.cookiejar import Cookie
# parse a raw Cookie header string into a dict
def parse_cookie_string(cookie_string):
    cookies = {}
    for cookie in cookie_string.split(';'):
        name, value = cookie.strip().split('=', 1)
        cookies[name] = value
    return cookies
# pack a cookie dict into a CookieJar object
def cookies_dict_to_cookiejar(cookie_dict):
    cookie_jar = http.cookiejar.CookieJar()
    for name, value in cookie_dict.items():
        cookie = Cookie(
            version=0, name=name, value=value,
            port=None, port_specified=False,
            domain="", domain_specified=False, domain_initial_dot=False,
            path="/", path_specified=True,
            secure=False, expires=None, discard=True,
            comment=None, comment_url=None, rest={}, rfc2109=False
        )
        cookie_jar.set_cookie(cookie)
    return cookie_jar
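A quick usage sketch for these two helpers (the cookie string below is a made-up placeholder, not a real token):
from get_cookiejar import parse_cookie_string, cookies_dict_to_cookiejar
import requests
raw = "RK=abc123; ptcz=def456" #placeholder values
jar = cookies_dict_to_cookiejar(parse_cookie_string(raw))
#the jar plugs straight into requests, same as user_cookie in the main program
resp = requests.get("https://book.qq.com", cookies=jar, timeout=5)
print(resp.status_code)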
And finally, this is the program that drives a headless browser with selenium:
from selenium import webdriver
from selenium.webdriver.firefox.service import Service
from selenium.webdriver.firefox.options import Options
import time
import random
# 1. configure the Firefox options
firefox_options = Options()
firefox_options.add_argument("--headless") # headless mode: run in the background with no browser window
firefox_options.add_argument("--disable-gpu") # disable GPU acceleration (optional)
gecko_service = Service("/usr/local/bin/geckodriver")
def get_dynamic_url(goal_url,status=True):
    driver = webdriver.Firefox(service=gecko_service, options=firefox_options)
    try:
        driver.get(goal_url)
    except Exception:
        print("connection failed, restarting the fetch")
        driver.quit()
        time.sleep(random.randint(1,10))
        html_string=get_dynamic_url(goal_url=goal_url)
        return html_string
    else:
        random.seed(time.time())
        time.sleep(random.randint(1,3)) # give the dynamic content a moment to render
        html_string=driver.page_source
        driver.quit()
        return html_string
if __name__ == "__main__":
    html_string=get_dynamic_url("https://book.qq.com/book-read/50780743/1")
    print(html_string)
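One refinement worth considering: get_dynamic_url launches and tears down a fresh Firefox for every single chapter, which dominates the runtime. Here is a sketch of reusing one driver across calls (same assumed geckodriver path as above; fetch is my own hypothetical helper, not part of the program):
#sketch: reuse a single headless Firefox instead of starting one per chapter
from selenium import webdriver
from selenium.webdriver.firefox.service import Service
from selenium.webdriver.firefox.options import Options
import time
import random
opts = Options()
opts.add_argument("--headless")
shared_driver = webdriver.Firefox(service=Service("/usr/local/bin/geckodriver"), options=opts)
def fetch(url, retries=3):
    for _ in range(retries):
        try:
            shared_driver.get(url)
            time.sleep(random.randint(1, 3)) #let the dynamic content render
            return shared_driver.page_source
        except Exception:
            time.sleep(random.randint(1, 10)) #back off, then retry with the same driver
    raise RuntimeError(f"failed to fetch {url}")
Since every page load then runs inside one browser session, the rotated cookies stay consistent across chapters for free.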
Finally, a look at how it runs: since each accumulation step needs well over a million characters, the whole program takes a long time, but there is no need to worry about network hiccups along the way; the relevant exceptions were all handled during debugging.