I recently came across an interesting library that claims to crush Selenium, and opening it out of curiosity felt like discovering a new world. To try it on a dynamic site, I used DrissionPage to write an automated crawler (it scrapes images from 丽人网 and downloads them to local disk).
This article covers the main techniques involved: DrissionPage usage, handling ajax-loaded dynamic pages, xpath parsing, and saving images with async coroutines.
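Before the full script, here is a minimal sketch of the DrissionPage calls it relies on (the URL, element id and xpath below are placeholders of my own, not taken from the original):

from DrissionPage import ChromiumPage

page = ChromiumPage()                                # launch / attach to a Chromium-based browser
page.get("https://example.com")                      # placeholder URL
box = page.ele("#search-input")                      # locate an element by id (placeholder id)
box.input("keyword")                                 # type text into it
items = page.eles("xpath://div[@class='item']")      # collect elements with an xpath locator (placeholder)
print(f"Found {len(items)} items")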
import asyncio
import os
import time

import aiohttp
from DrissionPage import ChromiumOptions, ChromiumPage

# Point DrissionPage at the Chromium-based browser installed on your machine
# (Chrome or Edge both work); the path below is an Edge install path.
path = r"C:\Program Files (x86)\Microsoft\Edge\Application\msedge.exe"
ChromiumOptions().set_browser_path(path).save()
class Spider:
    def __init__(self, name="利世", start_page=1, all_page=None):
        self.page = ChromiumPage()
        self.name: str = name
        self.file_div_path: str = name        # images are saved under a folder named after the search keyword
        self.index = 0
        self.start_page = start_page          # 1-based index of the first search result to process
        self.all_page = all_page              # max number of results to process; None means all
        if not os.path.exists(self.file_div_path):
            os.makedirs(self.file_div_path)

    def start(self):
        # Open the site, type the keyword into the search box and submit the search
        self.page.get("https://spacemiss.com/")
        ele = self.page.ele("#tdb-search-form-input-tdi_28")
        ele.input(self.name)
        ele = self.page.ele(".wpb_button wpb_btn-inverse tdb-search-form-btn")
        ele.click()

    def analysis_first_page(self):
        height = self.page.run_js_loaded("return document.body.scrollHeight")
        # The result list is loaded lazily via ajax: keep scrolling to the bottom
        # until the page height stops growing, i.e. no more results are appended.
        while True:
            # Scroll to the bottom of the page
            self.page.run_js_loaded("window.scrollTo(0, document.body.scrollHeight);")
            # Wait for the next batch of results to load
            time.sleep(4)
            # Compare the new page height with the previous one
            new_height = self.page.run_js_loaded("return document.body.scrollHeight")
            if new_height == height:
                break
            height = new_height
        links = self.page.eles(
            "xpath:/html/body/div[6]/div[2]/div/div/div/div[2]/div/div/div/div/div[1]/div"
        )
        print(f"Found {len(links)} results")
        i = 0
        for link in links[self.start_page - 1:]:
            a = link.ele("xpath:./div/div[1]/div/a")
            url = a.attr("href")
            # Open each detail page in a new tab, scrape it, then close the tab
            tab_page = self.page.new_tab()
            tab_page.get(url)
            self.analysis_second_page(tab_page)
            tab_page.close()
            i += 1
            if self.all_page and i >= self.all_page:
                break

    def analysis_second_page(self, tab_page):
        # The detail page can render slowly; retry until the title element appears
        while True:
            try:
                time.sleep(3)
                title = tab_page.ele('xpath://h1[@class="tdb-title-text"]')
                break
            except Exception:
                tab_page.refresh()
        imgs = tab_page.eles(
            "xpath:/html/body/div[6]/div[2]/div/div/article/div/div/div[4]/div/div[2]/div/div/div[2]/img"
        )
        urls = []
        for img in imgs:
            img_url = img.attr("src")
            urls.append(img_url)
        # Sanitise the title so it can be used as a directory name
        title = title.text.strip()
        title = title.replace(" ", "")
        title = title.replace(".", "")
        title = title.replace("|", "")
        file_path = self.file_div_path + "/" + title
        if not os.path.exists(file_path):
            os.makedirs(file_path)
        else:
            # Directory already exists: this album was downloaded before, skip it
            return
        print(f"{title}: found {len(urls)} images")
        tasks = [self.save_img(url, title) for url in urls]

        # Download all images of this album concurrently
        async def main():
            await asyncio.gather(*tasks)

        asyncio.run(main())

    def close(self):
        self.page.close()

    async def save_img(self, img_url: str, title: str):
        retry_count = 3
        while retry_count > 0:
            try:
                async with aiohttp.ClientSession(
                    timeout=aiohttp.ClientTimeout(total=60)
                ) as session:
                    async with session.get(img_url) as resp:
                        # Name the file after the last 7 characters of the URL
                        save_path = (
                            self.file_div_path + f"/{title}" + f"/{img_url[-7:]}"
                        )
                        with open(save_path, "wb") as f:
                            # Stream the response body to disk in 1 KB chunks
                            while True:
                                chunk = await resp.content.read(1024)
                                if not chunk:
                                    break
                                f.write(chunk)
                        break  # download succeeded, leave the retry loop
            except Exception:
                print(f"Server disconnected, retrying ({3 - retry_count + 1})...")
                retry_count -= 1
                await asyncio.sleep(3)  # wait 3 seconds before trying again

    def run(self):
        self.start()
        self.analysis_first_page()
        self.close()


spider = Spider(name="日奈娇", start_page=1)
spider.run()
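To run the spider for a different keyword or only part of the result list, the constructor arguments can be adjusted; for example (the keyword is the class default, and the all_page value here is my own choice):

spider = Spider(name="利世", start_page=1, all_page=2)  # process only the first two search results
spider.run()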
On a poor network connection the downloads may fail, so each image download is retried up to three times.
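The retry logic lives inline in save_img; if you prefer to factor it out, one option (a sketch of my own, not part of the original script) is an async retry decorator built with functools.wraps:

import asyncio
from functools import wraps

def async_retry(times=3, delay=3):
    """Retry an async function a few times, pausing between attempts."""
    def decorator(func):
        @wraps(func)
        async def wrapper(*args, **kwargs):
            last_exc = None
            for attempt in range(1, times + 1):
                try:
                    return await func(*args, **kwargs)
                except Exception as exc:
                    last_exc = exc
                    print(f"Attempt {attempt} failed: {exc}, retrying...")
                    await asyncio.sleep(delay)
            raise last_exc
        return wrapper
    return decorator

# Usage sketch: decorate save_img and drop its hand-written retry loop
# @async_retry(times=3, delay=3)
# async def save_img(self, img_url, title): ...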
Copyright belongs to the original author 清长忆. In case of infringement, please contact us for removal.