04 Handling Asynchronously Loaded Dynamic HTML Pages with Scrapy and Selenium on Python 3.8

1 On asynchronously loaded HTML pages, XPath cannot find the data in the raw page source
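
To see the problem concretely, here is a minimal sketch (assuming `requests` and `lxml` are installed; the XPath is the same one the spider uses in section 1.6.2) that fetches the search page without a browser and finds no result nodes:

    import requests
    from lxml import etree

    # Fetch the raw HTML the server returns, without executing any JavaScript.
    html = requests.get('https://s.taobao.com/search?q=手机',
                        headers={'User-Agent': 'Mozilla/5.0'}).text
    tree = etree.HTML(html)
    # The result cards are rendered client-side by AJAX, so this XPath
    # matches nothing in the static source.
    items = tree.xpath('//*[@id="root"]/div/div[3]/div[1]/div[1]/div[2]/div[3]/div/div')
    print(len(items))  # typically 0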

1.0 Site Analysis

  1. Taobao search page URL: https://s.taobao.com/search?q=手机
  2. Analysis of the search results pages:
  3. Page 1: https://s.taobao.com/search?q=手机
  4. Page 2: generated by an AJAX request (see the URL sketch after this list)
  5. Last page: also generated by an AJAX request
  6. Request method: GET
  7. Response format: HTML
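
Pagination is driven by the `s` query parameter, an item offset of 48 results per page (per the spider code in section 1.6.2), so page n starts at s = (n - 1) * 48. A small sketch:

    # Hypothetical helper that builds the search URL for a given page number.
    def search_url(keyword: str, page: int) -> str:
        return f'https://s.taobao.com/search?q={keyword}&s={(page - 1) * 48}'

    print(search_url('手机', 1))  # https://s.taobao.com/search?q=手机&s=0
    print(search_url('手机', 2))  # https://s.taobao.com/search?q=手机&s=48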

1.1 Create the Project

    scrapy startproject taobaoSpider
    cd taobaoSpider

1.2 Create the Spider

    scrapy genspider taobao "taobao.com"
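
The command writes a spider skeleton to taobaoSpider/spiders/taobao.py, roughly as below (Scrapy's default template; details vary slightly across versions):

    import scrapy

    class TaobaoSpider(scrapy.Spider):
        name = "taobao"
        allowed_domains = ["taobao.com"]
        start_urls = ["https://taobao.com"]

        def parse(self, response):
            pass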

1.3 Add a Utility Module, utils.py

    from selenium import webdriver
    from selenium.webdriver.chrome.service import Service
    from selenium.common import exceptions
    import json

    def create_chrome_driver(headless=False):
        options = webdriver.ChromeOptions()
        if headless:
            options.add_argument('--headless')
        # Remove the "Chrome is being controlled by automated test software" infobar
        options.add_experimental_option('excludeSwitches', ['enable-automation'])
        options.add_experimental_option('useAutomationExtension', False)
        options.add_argument('--disable-blink-features=AutomationControlled')
        # Path to the Chrome driver executable
        service = Service('chromedriver.exe')
        # service = Service(r'E:\项目区\项目2023-编程项目教程\ssq_caipiao_pachong\taobaoSpider\chromedriver.exe')
        browser = webdriver.Chrome(service=service, options=options)
        # Anti-bot countermeasure: make navigator.webdriver read as undefined
        browser.execute_cdp_cmd('Page.addScriptToEvaluateOnNewDocument', {
            'source': 'Object.defineProperty(navigator, "webdriver", {get: () => undefined})'})
        return browser

    def add_cookies(browser, cookie_file):
        with open(cookie_file, 'r') as file:
            cookies_list = json.load(file)
        for cookie_dict in cookies_list:
            if cookie_dict['secure']:
                try:
                    browser.add_cookie(cookie_dict)
                except exceptions.InvalidCookieDomainException as e:
                    print(e.msg)

    def test():
        # Placeholder used to verify the module imports correctly
        print("ggggggg")
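
A quick manual check that the driver starts and the CDP patch works — a sketch, assuming a chromedriver.exe matching your local Chrome version sits in the working directory:

    from utils import create_chrome_driver

    browser = create_chrome_driver(headless=True)
    browser.get('https://www.taobao.com')
    # With the CDP script injected, navigator.webdriver reads as undefined
    # (None on the Python side) instead of true.
    print(browser.execute_script('return navigator.webdriver'))
    browser.quit()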

1.4 Testing Taobao's Anti-Crawling Measures

1.4.1 taobao_login.py: simulated login to generate taobao_cookie.json

    from utils import create_chrome_driver, add_cookies, test
    import json
    import time
    from selenium.webdriver.common.by import By

    browser = create_chrome_driver()
    time.sleep(1)
    # browser.get('https://taobao.com')
    # time.sleep(1)
    # el = browser.find_element(by=By.XPATH, value='//*[@id="q"]')
    # el.send_keys('手机')
    # time.sleep(1)
    # el = browser.find_element(by=By.XPATH, value='//*[@id="J_TSearchForm"]/div[1]/button')
    # el.click()
    # time.sleep(1)
    # # Scroll to the bottom:
    # # js = "window.scrollTo(0,450);"
    # # browser.execute_script(js)
    # # time.sleep(3)
    # # or:
    # js = "var q=document.documentElement.scrollTop=4514"
    # browser.execute_script(js)
    # time.sleep(1)
    # # Click "next page":
    # el = browser.find_element(by=By.XPATH, value='//*[@id="root"]/div/div[3]/div[1]/div[1]/div[2]/div[4]/div/div/button[2]')
    # el.click()
    # time.sleep(1)
    # browser.get('https://s.taobao.com/search?commend=all&ie=utf8&initiative_id=tbindexz_20170306&q=%E6%89%8B%E6%9C%BA&search_type=item&sourceId=tb.index&spm=a21bo.jianhua.201856-taobao-item.2&ssid=s5-e')

    # Open the login page
    browser.get('https://login.taobao.com/member/login.jhtml')
    time.sleep(1)
    el = browser.find_element(by=By.XPATH, value='//*[@id="fm-login-id"]')
    el.send_keys('123@qq.com')  # replace with your account
    el = browser.find_element(by=By.XPATH, value='//*[@id="fm-login-password"]')
    el.send_keys('123456')      # replace with your password
    el = browser.find_element(by=By.XPATH, value='//*[@id="login-form"]/div[4]/button')
    el.click()
    time.sleep(6)

    # Save the cookies
    with open('taobao_cookie.json', 'w') as file:
        json.dump(browser.get_cookies(), file)
    time.sleep(1)
    # print(browser.page_source)
    browser.get('https://s.taobao.com/search?q=手机')
    time.sleep(1)
    time.sleep(600)  # keep the browser open for inspection
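
Submitting hard-coded credentials like this often triggers Taobao's slider captcha. A more reliable variant is a sketch in which you complete the login by hand before the cookies are dumped:

    from utils import create_chrome_driver
    import json

    browser = create_chrome_driver()
    browser.get('https://login.taobao.com/member/login.jhtml')
    # Finish the login (including any captcha) in the opened window first.
    input('Press Enter after you have logged in manually...')
    with open('taobao_cookie.json', 'w') as file:
        json.dump(browser.get_cookies(), file)
    browser.quit()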

1.4.2 taobao_login_after.py: test after logging in to Taobao

    from utils import create_chrome_driver, add_cookies, test
    import json
    import time
    from selenium.webdriver.common.by import By

    browser = create_chrome_driver()
    time.sleep(1)
    # Visit the domain first, then set the cookies; otherwise add_cookie raises an error
    browser.get('https://taobao.com')
    time.sleep(1)
    add_cookies(browser, 'taobao_cookie.json')
    time.sleep(1)
    browser.get('https://s.taobao.com/search?q=手机')
    time.sleep(1)
    time.sleep(600)  # keep the browser open for inspection
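
A rough way to confirm the injected cookies are valid (an informal check, appended to the script above, not an official API): a live session should not bounce the search page back to the login page.

    browser.get('https://s.taobao.com/search?q=手机')
    time.sleep(2)
    # If the cookies expired, Taobao redirects to login.taobao.com.
    print('login' not in browser.current_url)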

1.5 Modify the Downloader Middleware

    from scrapy import signals
    from scrapy.http import HtmlResponse
    import time

    # useful for handling different item types with a single interface
    from itemadapter import is_item, ItemAdapter

    from utils import create_chrome_driver, add_cookies
    from taobaoSpider.spiders.taobao import TaobaoSpider


    class TaobaospiderDownloaderMiddleware:
        # Not all methods need to be defined. If a method is not defined,
        # scrapy acts as if the downloader middleware does not modify the
        # passed objects.

        @classmethod
        def from_crawler(cls, crawler):
            # This method is used by Scrapy to create your spiders.
            s = cls()
            crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
            return s

        def __init__(self):
            self.browser = create_chrome_driver()
            self.browser.get('https://www.taobao.com')
            add_cookies(self.browser, 'taobao_cookie.json')

        def __del__(self):
            # quit() shuts down the driver process; close() would only close the window
            self.browser.quit()

        def process_request(self, request, spider):
            # Called for each request that goes through the downloader middleware.
            # Must either:
            # - return None: continue processing this request
            # - or return a Response object
            # - or return a Request object
            # - or raise IgnoreRequest: process_exception() methods of
            #   installed downloader middleware will be called
            if not isinstance(spider, TaobaoSpider):
                return None
            self.browser.get(request.url)
            time.sleep(2)
            # # Scroll to the bottom in one jump:
            # js = "window.scrollTo(0,450);"
            # self.browser.execute_script(js)
            # time.sleep(3)
            # # or:
            # js = "var q=document.documentElement.scrollTop=4514"
            # self.browser.execute_script(js)
            # time.sleep(2)
            # Scroll down gradually so lazy-loaded items have time to render
            for i in range(45, 4514, 400):
                js = f"var q=document.documentElement.scrollTop={i}"
                self.browser.execute_script(js)
                time.sleep(0.5)
            # # Click "next page":
            # el = self.browser.find_element(by=By.XPATH, value='//*[@id="root"]/div/div[3]/div[1]/div[1]/div[2]/div[4]/div/div/button[2]')
            # el.click()
            # time.sleep(1)
            return HtmlResponse(url=request.url, body=self.browser.page_source,
                                request=request, encoding='utf-8')

        def process_response(self, request, response, spider):
            # Called with the response returned from the downloader.
            # Must either:
            # - return a Response object
            # - return a Request object
            # - or raise IgnoreRequest
            return response

        def process_exception(self, request, exception, spider):
            # Called when a download handler or a process_request()
            # (from other downloader middleware) raises an exception.
            # Must either:
            # - return None: continue processing this exception
            # - return a Response object: stops process_exception() chain
            # - return a Request object: stops process_exception() chain
            pass

        def spider_opened(self, spider):
            spider.logger.info("Spider opened: %s" % spider.name)
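
The fixed `time.sleep(2)` after `self.browser.get(...)` can be replaced with an explicit wait, so the middleware returns as soon as the result list has rendered. A sketch (the XPath is the same one the spider uses in 1.6.2 and is an assumption about Taobao's current markup):

    from selenium.webdriver.common.by import By
    from selenium.webdriver.support.ui import WebDriverWait
    from selenium.webdriver.support import expected_conditions as EC

    def wait_for_results(browser, timeout=10):
        # Blocks until at least one result card exists, or raises TimeoutException.
        WebDriverWait(browser, timeout).until(
            EC.presence_of_element_located(
                (By.XPATH, '//*[@id="root"]/div/div[3]/div[1]/div[1]/div[2]/div[3]/div/div')))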

Then enable the middleware in settings.py:

    DOWNLOADER_MIDDLEWARES = {
        "taobaoSpider.middlewares.TaobaospiderDownloaderMiddleware": 543,
    }
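
Two related settings are usually needed as well (not shown in the original post, so treat them as assumptions based on common practice): Taobao's robots.txt disallows these pages, and a single shared browser cannot serve concurrent requests:

    ROBOTSTXT_OBEY = False      # the search pages are disallowed by robots.txt
    CONCURRENT_REQUESTS = 1     # one shared Selenium browser, so serialize requests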

1.6 Modify the Spider Code

1.6.1 Add the Item Model (items.py)

    import scrapy

    class TaobaospiderItem(scrapy.Item):
        # define the fields for your item here like:
        # name = scrapy.Field()
        title = scrapy.Field()

1.6.2 Modify the Spider (taobao.py)

    import scrapy
    from scrapy import Request
    from scrapy.http import HtmlResponse
    from taobaoSpider.items import TaobaospiderItem

    class TaobaoSpider(scrapy.Spider):
        name = "taobao"
        allowed_domains = ["taobao.com"]
        # start_urls = ["https://taobao.com"]

        def start_requests(self):
            keywords = ['手机', '笔记本电脑', '键鼠套装']
            keywords = ['手机']          # restrict to one keyword while testing
            for keyword in keywords:
                for page in range(1):   # only the first page; widen the range for more
                    # s is the item offset: 48 results per page
                    url = f'https://s.taobao.com/search?q={keyword}&s={page * 48}'
                    yield Request(url)

        def parse(self, response: HtmlResponse):
            # print(response.text)
            cssitem_list = response.xpath('//*[@id="root"]/div/div[3]/div[1]/div[1]/div[2]/div[3]/div/div')
            # print(len(cssitem_list))
            for cssitem in cssitem_list:
                item = TaobaospiderItem()
                item['title'] = cssitem.xpath('./a/div/div[1]/div[2]/div/span/text()').get()
                yield item

1.6.3 Run the Spider

    scrapy crawl taobao                  # run the spider
    scrapy crawl taobao -o taobao.csv    # or export the items to CSV
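
Note: on Scrapy 2.x, `-o` appends to an existing output file while `-O` overwrites it, so `scrapy crawl taobao -O taobao.csv` is the safer choice for repeated test runs.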

Reposted from: https://blog.csdn.net/qq_42574478/article/details/132916657
Copyright belongs to the original author, 海纳百川程序员. In case of infringement, please contact us for removal.
