

Scrapy + Selenium + Chaojiying (超级鹰) captcha recognition for crawling a website

I. Installing Scrapy

1. Windows installation

    pip install Scrapy

2. Install Selenium

    pip install selenium

3. Download the Chrome driver (a quick verification script follows these steps)

  a. Check your Google Chrome browser version.
     ChromeDriver download address: http://chromedriver.storage.googleapis.com/index.html

  b. Find the driver package whose version is closest to your browser version.

  c. After downloading, place chromedriver in the same directory as your Python installation.

  d. Configure the environment variable.
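
To confirm the driver is set up correctly, you can launch a browser from a short Python script before wiring it into Scrapy (a minimal check, assuming chromedriver is on PATH; the URL is only an example):

    from selenium.webdriver import Chrome

    # If chromedriver is found on PATH, Chrome() starts a browser window
    # without any extra arguments.
    web = Chrome()
    web.get('https://example.com')
    print(web.title)
    web.quit()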

4. Chaojiying (超级鹰) captcha recognition (a short usage sketch follows these steps)

  a. Chaojiying official site: https://www.chaojiying.com/
  b. Register and log in.
  c. Generate a software ID.
  d. Download the Python SDK and place it in the spider project directory.
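
The downloaded SDK provides a Chaojiying_Client class, which the spider below imports from the spiders package. Its basic use looks roughly like this (a sketch; the account, password, software ID, and image file are placeholders):

    from chaojiying import Chaojiying_Client

    # placeholders: your Chaojiying account, password, and the software ID from step c
    chaojiying = Chaojiying_Client('account', 'password', 'software id')

    # read a captcha image as bytes (the spider passes a Selenium element screenshot instead)
    with open('captcha.png', 'rb') as f:
        img = f.read()

    # 1902 is the captcha type code used later in the spider
    result = chaojiying.PostPic(img, 1902)
    print(result['pic_str'])  # the recognized captcha text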

II. Generating the Scrapy project

1. Win + R (open a command prompt)

2. Enter the following commands; a concrete example for this article's project is shown after the list.

    # change to the directory you want
    cd C:\Users\(username)\Desktop\spider
    # create the project
    scrapy startproject (project name)
    # change into the newly created folder
    cd hellospider
    # create the spider
    scrapy genspider (spider name) (domain of the site to crawl)
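
For the project used later in this article, the sequence would look roughly like this (the target domain is left as a placeholder):

    cd C:\Users\(username)\Desktop\spider
    scrapy startproject chuxiongfilespider
    cd chuxiongfilespider
    scrapy genspider chuxiongfile (domain of the target site)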

3. Open the project in PyCharm

4. Switch to a virtual environment

File -> Settings (select the project's Python interpreter)

In the command line inside PyCharm, install Scrapy and Selenium again so they land in the virtual environment.
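
In the PyCharm terminal, for example:

    # run inside PyCharm's terminal so the packages are installed into the new virtual environment
    pip install scrapy selenium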

III. Crawling a website (below I use a project created earlier, not the one just created)

1. Modify settings.py

    # do not obey the robots.txt protocol
    ROBOTSTXT_OBEY = False

    # delay between downloads (seconds)
    DOWNLOAD_DELAY = 1

    # when enabled, Scrapy waits a random interval between requests to the same
    # website: a random value between 0.5 and 1.5 multiplied by DOWNLOAD_DELAY
    RANDOMIZE_DOWNLOAD_DELAY = True

    # if a request takes longer than 60 seconds an exception is raised,
    # and the retry mechanism will issue the request again
    DOWNLOAD_TIMEOUT = 60

    # default request headers
    DEFAULT_REQUEST_HEADERS = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Language': 'en',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.94 Safari/537.36',
    }

    # enable an item pipeline
    ITEM_PIPELINES = {
        # '<project name>.pipelines.<pipeline class name>': 300,
        'chuxiongfilespider.pipelines.ChuxiongfilespiderPipeline': 300,
    }

2. The items.py file

Define the fields you need.
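
A minimal sketch of items.py, covering only the fields that the spider and pipeline below actually use:

    import scrapy


    class ChuxiongfilespiderItem(scrapy.Item):
        # fields filled in by the spider and written to MySQL by the pipeline
        name = scrapy.Field()            # document title
        policy_id = scrapy.Field()       # UUID derived from the title
        attachment_id = scrapy.Field()
        url = scrapy.Field()             # detail page URL
        attachment_url = scrapy.Field()  # attachment link, or a placeholder text
        netloc = scrapy.Field()          # site address
        content = scrapy.Field()         # HTML content of the detail page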

3. Write the spider file chuxiongfile.py

    import copy
    import time
    import uuid

    import scrapy
    from pymysql.converters import escape_string
    from scrapy.http import HtmlResponse
    from selenium.common.exceptions import NoSuchElementException
    from selenium.webdriver import Chrome
    from selenium.webdriver.common.by import By

    from chuxiongfilespider.items import ChuxiongfilespiderItem
    from chuxiongfilespider.spiders.chaojiying import Chaojiying_Client


    class ChuxiongfileSpider(scrapy.Spider):
        name = 'chuxiongfile'
        allowed_domains = ['(target domain)']
        start_urls = ['(URL to crawl)']
        page = 1

        def start_requests(self):
            web = Chrome()
            web.get(self.start_urls[0])
            try:
                # Selenium was updated: the old find_element_by_xpath must be rewritten
                # as find_element(By.XPATH, ...), which requires importing By
                web.find_element(By.XPATH, '/html/body/form/div/img')
                # screenshot_as_png returns a screenshot of the element as binary PNG data
                img = web.find_element(By.XPATH, '/html/body/form/div/img').screenshot_as_png
                # let Chaojiying recognize the captcha
                chaojiying = Chaojiying_Client('(Chaojiying account)', '(Chaojiying password)', '(software id)')
                # 1902 is the captcha type to recognize
                dic = chaojiying.PostPic(img, 1902)
                verify_code = dic['pic_str']
                # fill in the captcha
                web.find_element(By.XPATH, '//*[@id="visitcode"]').send_keys(verify_code)
                # click confirm
                time.sleep(2)
                web.find_element(By.XPATH, '/html/body/form/div/input[4]').click()
                # collect the cookies set after the captcha was accepted
                cookies_dict = {cookie['name']: cookie['value'] for cookie in web.get_cookies()}
                web.close()
                yield scrapy.Request(url=self.start_urls[0], cookies=cookies_dict, callback=self.parse)
            except NoSuchElementException:
                # no captcha on the page, request it directly
                yield scrapy.Request(url=self.start_urls[0], callback=self.parse)

        def parse(self, response: HtmlResponse, **kwargs):
            items = ChuxiongfilespiderItem()
            for item in response.css('.tml'):
                items['name'] = item.css('.tcc a::text').extract()[0]
                items['policy_id'] = ''.join(str(uuid.uuid5(uuid.NAMESPACE_DNS, items['name'])).split('-'))
                items['attachment_id'] = '123'
                items['url'] = response.urljoin(item.css('.tcc a::attr(href)').extract_first())
                if item.css('.d a::attr(href)').extract_first() == '':
                    items['attachment_url'] = 'no download option'
                else:
                    items['attachment_url'] = response.urljoin(item.css('.d a::attr(href)').extract_first())
                items['netloc'] = '(site address)'
                yield scrapy.Request(url=items['url'], callback=self.get_details,
                                     meta={"items": copy.deepcopy(items)})

        def get_details(self, response):
            items = response.meta['items']
            items['content'] = escape_string(" ".join(response.css('.xzgfwrap').getall()))
            yield items
            if self.page < 2:
                self.page += 1
                url = f'http://(site address)?totalpage=3&PAGENUM={str(self.page)}&urltype' \
                      f'=tree.TreeTempUrl&wbtreeid=3494'
                yield scrapy.Request(url=url, callback=self.parse)  # request the next page via the callback
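
With settings, item, spider, and pipeline in place, the crawl is started from the project root as usual:

    scrapy crawl chuxiongfile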

4. Store the data in the database: pipelines.py

    # Define your item pipelines here
    #
    # Don't forget to add your pipeline to the ITEM_PIPELINES setting
    # See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html

    # useful for handling different item types with a single interface
    from itemadapter import ItemAdapter
    import pymysql


    class ChuxiongfilespiderPipeline(object):
        mysql = None
        cursor = None  # cursor used to execute SQL statements

        def open_spider(self, spider):
            self.mysql = pymysql.Connect(host='localhost', user='(database user)', password='(database password)',
                                         port=3306, charset='utf8', database='(database name)')
            self.cursor = self.mysql.cursor()

        def process_item(self, items, spider):
            # create the tables if they do not exist yet
            table = 'create table if not exists cx_other(' \
                    'id int not null primary key auto_increment' \
                    ',policy_id varchar(100)' \
                    ',url varchar(1000)' \
                    ',attachment_id varchar(100)' \
                    ',attachment_url varchar(100)' \
                    ',name varchar(150)' \
                    ',netloc varchar(50)' \
                    ');'
            table_1 = 'create table if not exists cx_other_content(' \
                      'id int not null primary key auto_increment' \
                      ',policy_id varchar(100)' \
                      ',content MEDIUMTEXT NOT NULL' \
                      ');'
            insert = 'insert into cx_other(policy_id,url,attachment_id,attachment_url,name,netloc) ' \
                     'values("%s","%s","%s","%s","%s","%s")' \
                     % (items['policy_id'], items['url'], items['attachment_id'],
                        items['attachment_url'], items['name'], items['netloc'])
            insert_1 = 'insert into cx_other_content(policy_id,content) values("%s","%s")' % (
                items['policy_id'], items['content'])
            try:
                # reconnect if the database connection has dropped
                self.mysql.ping(reconnect=True)
                # create the tables
                self.cursor.execute(table)
                self.cursor.execute(table_1)
                # insert the data
                self.cursor.execute(insert)
                self.cursor.execute(insert_1)
                self.mysql.commit()
                print('=============== insert succeeded ===============')
            except Exception as e:
                print('=============== insert failed ===============', e)
                self.mysql.rollback()
            return items

        def close_spider(self, spider):
            self.cursor.close()
            self.mysql.close()
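
Note that building the INSERT statements with % string formatting is fragile: quotes in the scraped text can break the SQL, which is why the spider calls escape_string on the content. A safer alternative, shown here as a sketch rather than the original author's code, is to let pymysql escape the values via parameterized execute:

    # parameterized version of the two inserts: pymysql escapes the values itself,
    # so escape_string() in the spider is no longer needed
    self.cursor.execute(
        'insert into cx_other(policy_id,url,attachment_id,attachment_url,name,netloc) '
        'values(%s,%s,%s,%s,%s,%s)',
        (items['policy_id'], items['url'], items['attachment_id'],
         items['attachment_url'], items['name'], items['netloc']))
    self.cursor.execute(
        'insert into cx_other_content(policy_id,content) values(%s,%s)',
        (items['policy_id'], items['content']))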
Tags: scrapy selenium python

Reposted from: https://blog.csdn.net/weixin_41586246/article/details/126697920
Copyright belongs to the original author 李甜甜~. If there is any infringement, please contact us for removal.
