python爬虫selenium+scrapy常用功能笔记

爬虫Selenium+scrapy常用功能笔记

Selenium

常用包的导入

import re ,time ,json,os,random

from selenium import webdriver  # 导入 webdriverfrom selenium.webdriver.common.keys import Keys  # 要想调用键盘按键操作需要引入keys包 比如 回车和ctrl键from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait  # WebDriverWait 库，负责循环等待from selenium.webdriver.support import expected_conditions as EC  # expected_conditions 类，负责条件出发# 下面是解析和sql存储用的工具from lxml import etree
from pymysql import*

初始化配置和特征隐藏

chrome_options = Options()# chrome_options.add_argument("--headless")  # 设置为无头浏览器
chrome_options.add_argument("log-level=3")# 禁止掉浏览器调试的提示信息 有些网站console.log太多了
chrome_options.add_argument("disable-blink-features=AutomationControlled")# 这一行告诉chrome去掉机器人痕迹# 下面两行也可以手动去一部分
chrome_options.add_experimental_option('excludeSwitches',['enable-automation'])
chrome_options.add_experimental_option('useAutomationExtension',False)# executable_path 后值为驱动所在位置
driver = webdriver.Chrome(executable_path="chromedriver.exe",options=chrome_options)
start_url ="https://bot.sannysoft.com/"# 打开并访问网页
driver.get(start_url)

机器人特征检验

访问网址可以看到直观结果

https://bot.sannysoft.com/

显（隐）式等待

# 隐式等待
WebDriverWait(driver,8).until(
        EC.presence_of_element_located((By.XPATH,"//div[@class='job-list']//ul//li"))# 通过XPATH查找)# 显式等待
driver.implicitly_wait(0.2)

页面操作

获取页面dom

page_source = driver.page_source

页面元素获取

# 单一样式搜索
page_content = etree.HTML(driver.page_source)
content_now_page = page_content.xpath("//div[@class='job-list']//ul//li")
next_url =  response.xpath("(//div[@class='page_al']//a)[last()-1]/@href").get()# 多样式选择搜索  div[contains(@class,'job-limit')]# 使用括号包裹可以控制优先级 看情况使用
yao_qiu = page_content.xpath("(.//div[contains(@class,'job-limit')])//text()")

元素点击

element_next.click()

frame跳转

driver.switch_to.frame('editormd-image-iframe')

获取cookie

cookie = driver.get_cookies()# print("这是请求到的原始cookie",cookie)
cookies_list =[]for cookie_dict in cookie:
    cookie = cookie_dict['name']+'='+ cookie_dict['value']
    cookies_list.append(cookie)# print("拼接后的cookir",cookies_list)
cookies =';'.join(cookies_list)

给请求添加cookie

driver.add_cookie({"name":"_identity","value":"d192e16b"})

事件操作

点击

driver.find_element(By.CLASS_NAME,'add-img').click()

上传文件

driver.find_element(By.NAME,'editormd-image-file').send_keys(r"D:\img2.png")

退出页面

driver.quit()

Scrapy

多摘自之前文档
https://blog.csdn.net/weixin_43521165/article/details/111905800

初始创建命令

创建项目
scrapy startproject 爬虫项目名字 # 例如 scrapy startproject fang_spider
scrapy genspider 爬虫名字 ‘域名’ #例如 scrapy genspider fang ‘fang.com’

# 设置启动文件 在项目目录下建立就行 写入以下代码以后直接运行则可以启动爬虫# 这里第二行的 fang 是你创建的爬虫的名字from scrapy import cmdline
cmdline.execute("scrapy crawl fang".split())

常用请求头

需要更多可以点击去这里复制http://www.useragentstring.com

user_agent =["Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36","Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36","Mozilla/5.0 (X11; Ubuntu; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2919.83 Safari/537.36","Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2762.73 Safari/537.36","ozilla/5.0 (X11; Ubuntu; Linux i686 on x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2820.59 Safari/537.36"]

Parse解析

手动去重过滤

scrapy本身有链接去重功能，同样的链接不会重复访问。但是有些网站是在你请求A的时候重定向到B，重定向到B的时候又给你重定向回A，然后才让你顺利访问，此时scrapy由于默认去重，这样会导致拒绝访问A而不能进行后续操作.

解决方式：
在yield访问新链接时，加上 dont_filter=True 参数，不让它自动过滤

yield scrapy.Request(url=response.urljoin(next_url),callback=self.esf_parse,dont_filter=True)

meta传参

yield scrapy.Request(url=response.urljoin(next_url),headers=cooki,callback=self.esf_parse,
                     meta={'info':("123456", city),'cooki':cooki})

获取请求或者响应的cookie

# 获取请求的cookie
Cookie = response.request.headers.getlist('Cookie')# 请求# 获取相应的cookie
Cookie2 = response.headers.getlist('Set-Cookie')# 响应print(Cookie,'111cookoooo222:',Cookie2)

piplines.py 异步入库

只需要更改数据库的配置 item元素的获取和 sql语句即可

# Define your item pipelines here## Don't forget to add your pipeline to the ITEM_PIPELINES setting# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html# useful for handling different item types with a single interfacefrom twisted.enterprise import adbapi
from pymysql import cursors

classnewFangSpiderPipeline:
    x =0def__init__(self):
        dbparams ={'host':'127.0.0.1','port':3306,'user':'root','password':'password','database':'fangtianxia','charset':'utf8','cursorclass': cursors.DictCursor
        }
        self.dbpool = adbapi.ConnectionPool('pymysql',**dbparams)
        self._sql =None@propertydefsql(self):
        self.x +=1print('*-*'*10,'第{}条数据进来了++++++'.format(self.x))ifnot self._sql:
            self._sql ="""
            insert into newhouse(id,name ,province,city,price,areas,state ,style, address,ori_url) values
             (null,%s,%s,%s,%s,%s,%s,%s,%s,%s)
            """return self._sql
        return self._sql

    defprocess_item(self, item, spider):
        defer = self.dbpool.runInteraction(self.insert_item, item)
        defer.addErrback(self.handle_error, item, spider)definsert_item(self, cursor, item):
        cursor.execute(self.sql,(
        item['name'], item['province'], item['city'], item['price'], item['areas'], item['state'],
        item['style'],item['address'],item['ori_url']))# self.conn.commit()defhandle_error(self, error, item, spider):print('=^='*5,'error_start:')print(error)print('=^='*5,'error_end')

middlewares中间件使用selenium替代访问并获取cookie

# 可以写在一个新的中间件中并设置优先级classCookieSpiderDownloaderMiddleware:# Not all methods need to be defined. If a method is not defined,# scrapy acts as if the downloader middleware does not modify the# passed objects.# def __init__(self):defprocess_request(self, request, spider):# Called for each request that goes through the downloader# middleware.# Must either:# - return None: continue processing this request# - or return a Response object# - or return a Request object# - or raise IgnoreRequest: process_exception() methods of#   installed downloader middleware will be called# 打开并访问网页print("request.url:--",request.url)
        driver.get(request.url)# 隐式等待
        a = WebDriverWait(driver,8).until(
            EC.presence_of_element_located((By.XPATH,'//h3[contains(@class,"bili-video-card__info--tit")]')))# 通过XPATH查找if a:print("等到了a",a)
            source = driver.page_source
            # print("获取到的类型为",type(source))
            response = HtmlResponse(url=driver.current_url, body=source, request=request, encoding='utf8')return response
        else:returnNone

SQl

coon = connect(host='localhost', port=3306, db='boss_zhi_pin',
             user='root', passwd='password', charset='utf8')
cur = coon.cursor()

sql_order ="""insert into boss_position_4 (search_word,city_name,position_name ,company_name,areas,street,company_style,scale,
               xue_li,work_experience,financing_situation, benefits,salary,salary_month,
              salary_min,salary_max,average_salary) values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)"""
params1 =['岗位','城市','岗位具体名称','公司名称','所在区域','所在街道','公司类型','公司规模','学历要求','工作经验','融资情况','福利待遇','薪资（K）','发放月数','最低薪资','最高薪资','平均薪资']
cur.execute(sql_order, params1)# 这里第一次执行是建立字段名（可不写）
coon.commit()# 关闭连接
cur.close()
coon.close()

ip池子

ip_source =[{"ip":"27.9.47.216","port":4220},{"ip":"183.166.135.43","port":4226},{"ip":"101.74.181.221","port":4245},{"ip":"175.147.100.112","port":4260},{"ip":"115.205.77.140","port":4286},{"ip":"113.237.4.211","port":4256},{"ip":"116.115.209.201","port":4245},{"ip":"175.174.190.95","port":4251},{"ip":"106.112.124.153","port":4278},{"ip":"125.79.200.156","port":4281}]

标签： python

本文转载自: https://blog.csdn.net/weixin_43521165/article/details/130159357
版权归原作者 浪淘三千 所有，如有侵权，请联系我们删除。

python爬虫selenium+scrapy常用功能笔记

爬虫Selenium+scrapy常用功能笔记

Selenium

常用包的导入

初始化配置和特征隐藏

机器人特征检验

显（隐）式等待

页面操作

获取页面dom

页面元素获取

元素点击

frame跳转

获取cookie

给请求添加cookie

事件操作

点击

上传文件

退出页面

Scrapy

初始创建命令

常用请求头

Parse解析

手动去重过滤

meta传参

获取请求或者响应的cookie

piplines.py 异步入库

middlewares中间件使用selenium替代访问并获取cookie

SQl

ip池子

发表评论

“python爬虫selenium+scrapy常用功能笔记”的评论:

关于作者

overfit同步小助手

相关阅读

文章导航

python爬虫selenium+scrapy常用功能笔记

爬虫Selenium+scrapy常用功能笔记

Selenium

常用包的导入

初始化配置 和 特征隐藏

机器人特征检验

显（隐）式等待

页面操作

获取页面dom

页面元素获取

元素点击

frame跳转

获取cookie

给请求添加cookie

事件操作

点击

上传文件

退出页面

Scrapy

初始创建命令

常用请求头

Parse解析

手动去重过滤

meta传参

获取请求或者响应的cookie

piplines.py 异步入库

middlewares中间件使用selenium替代访问并获取cookie

SQl

ip池子

发表评论

“python爬虫selenium+scrapy常用功能笔记”的评论:

关于作者

overfit同步小助手

相关阅读

文章导航

初始化配置和特征隐藏