爬虫-python -综合练习-51job信息-滑块验证-selenium

利用selenium爬51job职位信息-破解滑块验证

1.爬51job职位信息

0.头文件

需要导入以下模块：

from selenium import webdriver
from selenium.webdriver import Chrome
from selenium.webdriver.common.by import By
import time
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
from PIL import Image
import csv

1.初始化

防止打印一些无用或者显示错误的日志。
wait是为后面wait.until做准备。

# Initialisation: configure Chrome so it does not print useless log output.
option = webdriver.ChromeOptions()
option.add_experimental_option(
    "excludeSwitches",
    ['enable-automation', 'enable-logging'],
)
web = Chrome(options=option)
# Shared explicit wait (20 s timeout) used by the later wait.until(...) calls.
wait = WebDriverWait(web, 20)

2.页面登陆

这个没什么好说的，打开页面以及输入账号密码点确定。

# Log in
def login():
    """Open the login page and submit the account credentials.

    Uses the module-level ``web`` driver and reads the global ``url``, which
    must be assigned before this function is called.
    """
    web.get(url)
    web.maximize_window()  # maximise the browser window
    time.sleep(2)  # fixed pause before the login form is filled in
    # Fill in the credentials, tick the agreement checkbox, then submit.
    web.find_element(By.ID, 'loginname').send_keys('账户')
    web.find_element(By.ID, 'password').send_keys('密码')
    web.find_element(By.ID, 'isread_em').click()
    web.find_element(By.ID, 'login_btn_withPwd').click()
    time.sleep(2)  # fixed pause after submitting

3.滑块验证

重点来了：滑块验证需要分成三部分。
第一部分：把滑块验证用到的带缺口图片和完整背景图片保存下来。
第二部分：比对这两张图片，计算缺口位置距图片左边界的距离。
第三部分：模拟人工滑动滑块。
1.保存带缺口图片和完整背景图片
这里一共保存了四张图片：51job.png 是整页截图，后面的元素图片都是从它上面裁剪出来的；51job_back 和 51job_full 用于后面的比对，51job_slice 实际上没有用到。
代码中的系数 1.25 是把截图与网页上的原图做对比后得出的缩放系数。
show_element / hide_element 用来设置元素可见与不可见，以对应网站更新后的页面代码。验证图片对应元素的路径可以通过浏览器开发者工具获得；验证码这部分的路径是不变的，所以可以直接复制。

# Screenshot a single element
def save_pic(obj, name):
    """Save a cropped screenshot of one page element as ``51job_<name>.png``.

    A full-page screenshot is written to ``.\\51job.png`` first and the
    element's rectangle is then cropped out of it. Failures are reported on
    stdout instead of being raised (best effort).

    :param obj: element to capture; its ``location`` and ``size`` are read
    :param name: (str) suffix used in the output file name
    """
    try:
        # save_screenshot returns a success flag, which is what gets printed.
        pic_url = web.save_screenshot('.\\51job.png')
        print("%s:截图成功!" % pic_url)
        # Element rectangle, scaled by 1.25: a factor the author obtained by
        # comparing the screenshot with the image shown on the page.
        # NOTE(review): presumably the display scaling of the author's
        # machine -- confirm before running elsewhere.
        left = obj.location['x'] * 1.25
        top = obj.location['y'] * 1.25
        right = left + obj.size['width'] * 1.25
        bottom = top + obj.size['height'] * 1.25
        print('图：' + name)
        print('Left %s' % left)
        print('Top %s' % top)
        print('Right %s' % right)
        print('Bottom %s' % bottom)
        print('')

        file_name = '51job_' + name + '.png'
        # Context manager so the screenshot file handle is always released.
        with Image.open('.\\51job.png') as im:
            im.crop((left, top, right, bottom)).save(file_name)
    except Exception as msg:
        # Exception rather than BaseException, so Ctrl+C / SystemExit still
        # propagate instead of being reported as a failed screenshot.
        print("%s:截图失败!" % msg)


# Make an element visible
def show_element(element):
    """Replace the element's inline style with ``display: block;``.

    :param element: element passed to the script as ``arguments[0]``
    """
    web.execute_script("arguments[0].style=arguments[1]", element, "display: block;")


# Make an element invisible
def hide_element(element):
    """Replace the element's inline style with ``display: none;``.

    :param element: element passed to the script as ``arguments[0]``
    """
    web.execute_script("arguments[0].style=arguments[1]", element, "display: none;")


def cut():
    """Capture the captcha canvases as ``51job_back/slice/full.png``.

    Waits (via the shared ``wait``) until the three captcha canvases are
    present, then screenshots them one after another.
    """
    c_background = wait.until(EC.presence_of_element_located(
        (By.CSS_SELECTOR, 'canvas.geetest_canvas_bg.geetest_absolute')))
    c_slice = wait.until(EC.presence_of_element_located(
        (By.CSS_SELECTOR, 'canvas.geetest_canvas_slice.geetest_absolute')))
    c_full_bg = wait.until(EC.presence_of_element_located(
        (By.CSS_SELECTOR, 'canvas.geetest_canvas_fullbg.geetest_fade.geetest_absolute')))
    # Hide the puzzle piece so the "back" capture shows only the background
    # with the gap.
    hide_element(c_slice)
    save_pic(c_background, 'back')
    # Show the puzzle piece again and capture it.
    show_element(c_slice)
    save_pic(c_slice, 'slice')
    # Force the complete-background canvas to display (presumably it is
    # hidden by default -- verify) and capture it.
    show_element(c_full_bg)
    save_pic(c_full_bg, 'full')

2.计算移动距离

将图一中的 1 移动到 2，相当于找出缺口图与背景图从左向右第一个不同的像素点的位置。

# Decide whether two pixels are the same
def is_pixel_equal(bg_image, fullbg_image, x, y, threshold=20):
    """Return whether both images have (nearly) the same colour at (x, y).

    :param bg_image: (Image) picture with the gap
    :param fullbg_image: (Image) complete picture
    :param x: (Int) x position
    :param y: (Int) y position
    :param threshold: (Int) a channel counts as different once the absolute
        difference reaches this value
    :return: (Boolean) True when the first three channels are all within
        ``threshold`` of each other
    """
    # Pixel of the gap picture (first three channels are used).
    bg_pixel = bg_image.load()[x, y]
    # Pixel of the complete picture.
    fullbg_pixel = fullbg_image.load()[x, y]
    # abs() must wrap the difference itself. Wrapping the comparison
    # (abs(a - b < threshold)) only noticed differences in one direction.
    return all(
        abs(bg_pixel[channel] - fullbg_pixel[channel]) < threshold
        for channel in range(3)
    )


# Compute how far the slider has to move
def get_distance(bg_image, fullbg_image):
    """Find the x coordinate of the first column where the images differ.

    :param bg_image: (Image) picture with the gap
    :param fullbg_image: (Image) complete picture
    :return: (Int) x coordinate of the first differing pixel, scanning
        columns left to right; None when no differing pixel is found
    """
    # Columns left of this x are skipped (the slider's initial position).
    distance = 60
    width, height = fullbg_image.size
    for i in range(distance, width):
        for j in range(height):
            if not is_pixel_equal(bg_image, fullbg_image, i, j):
                # This x coordinate is the distance the slider must travel.
                return i
    return None

3.模拟现实移动滑块
这里的滑块会验证是否为机器操作，如果被发现了，即使移动的位置正确，也会出现“怪物吃了拼图请重试”。
其实这里很简单，主要看滑块移动的过程中是否有停顿：可以先一下子拉到接近目标的位置，然后再稍微调整。

# Solve the slider captcha
def slide():
    """Drag the captcha slider to the gap found in the saved screenshots.

    Compares ``.\\51job_back.png`` with ``.\\51job_full.png`` to locate the
    gap, then performs the drag in two steps with short pauses in between.

    :raises ValueError: when the two screenshots show no differing pixel
    """
    # Local import: this name is not among the file's top-level imports.
    from selenium.common.exceptions import TimeoutException

    # Close both image files as soon as the comparison is done.
    with Image.open('.\\51job_back.png') as bg_image:
        with Image.open('.\\51job_full.png') as full_image:
            gap_x = get_distance(bg_image, full_image)
    if gap_x is None:
        raise ValueError(
            "no gap found: 51job_back.png and 51job_full.png do not differ")
    # Undo the 1.25 screenshot scale factor, and round because the drag
    # offsets are pixel counts.
    distance = round(gap_x / 1.25)

    try:
        # Locate the slider handle.
        slider = wait.until(EC.presence_of_element_located(
            (By.CSS_SELECTOR, 'div.geetest_slider_button')))
    except TimeoutException:
        # wait.until raises on timeout instead of returning a falsy value,
        # so "no slider" has to be detected here.
        print("===没有滑块验证===")
        return

    try:
        print("====有滑块验证=====")
        action_chains = webdriver.ActionChains(web)
        # Press and hold, ready to drag.
        action_chains.click_and_hold(slider)
        action_chains.pause(0.2)
        # Move most of the way in one go ...
        action_chains.move_by_offset(distance - 10, 0)
        action_chains.pause(0.6)
        # ... then add a small correcting move.
        action_chains.move_by_offset(10, 0)
        action_chains.pause(0.6)
        # Release the slider.
        action_chains.release()
        action_chains.perform()
        time.sleep(5)
    except Exception as e:
        print("===" + str(e))

4.获取网站的职位信息

获取分为两步：第一步定位到某一页（从 page_nums=1 开始），第二步把该页内有用的数据保存下来。
之后将 page_nums 加一，同时点击下一页，如此循环。

# Collect the job postings of the current page
def get_data(page_nums):
    """Append the job postings on the current page to ``data_job.csv``.

    Each row holds the text of the posting's ``jname``, ``cname``, ``sal``
    and ``d`` elements, in that order.

    :param page_nums: (Int) page number, used only in the progress message
    """
    lists = web.find_element(By.CLASS_NAME, 'j_joblist').find_elements(By.CLASS_NAME, 'e')
    with open('data_job.csv', mode='a', encoding='utf-8', newline='') as f:
        csv_w = csv.writer(f)
        # "item" instead of "list", so the builtin is not shadowed.
        for item in lists:
            job_name = item.find_element(By.CLASS_NAME, 'jname').text
            com_name = item.find_element(By.CLASS_NAME, 'cname').text
            sal_val = item.find_element(By.CLASS_NAME, 'sal').text
            din = item.find_element(By.CLASS_NAME, 'd').text
            csv_w.writerow([job_name, com_name, sal_val, din])
    print(page_nums, 'over!!')


# Type the keyword into the search box and scrape the first max_page pages
def search(input_str, max_page):
    """Search for ``input_str`` and save the first ``max_page`` result pages.

    :param input_str: (str) search keyword
    :param max_page: (Int) number of result pages to save
    """
    web.find_element(By.ID, 'kwdselectid').send_keys(input_str)
    web.find_element(By.XPATH, '/html/body/div[3]/div/div[1]/div/button').click()
    time.sleep(0.5)
    for page_nums in range(1, max_page + 1):
        get_data(page_nums)
        # Only move on while another page is still wanted; the original also
        # clicked once more after the last page had been saved.
        if page_nums < max_page:
            web.find_element(By.CLASS_NAME, 'p_in').click()
            time.sleep(1)

2.爬51job页面-利用selenium-破解滑块验证-完整代码

from selenium import webdriver
from selenium.webdriver import Chrome
from selenium.webdriver.common.by import By
import time
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
from PIL import Image
import csv
# Initialisation: configure Chrome so it does not print useless log output.
option = webdriver.ChromeOptions()
option.add_experimental_option("excludeSwitches", ['enable-automation', 'enable-logging'])
web = Chrome(options=option)
# Shared explicit wait (20 s timeout).
wait = WebDriverWait(web, 20)


# Log in
def login():
    """Open the login page and submit the account credentials.

    Reads the global ``url`` assigned in the ``__main__`` section below.
    """
    web.get(url)
    web.maximize_window()  # maximise the browser window
    time.sleep(2)  # fixed pause before the login form is filled in
    # Fill in the credentials, tick the agreement checkbox, then submit.
    web.find_element(By.ID, 'loginname').send_keys('账号')
    web.find_element(By.ID, 'password').send_keys('密码')
    web.find_element(By.ID, 'isread_em').click()
    web.find_element(By.ID, 'login_btn_withPwd').click()
    time.sleep(2)  # fixed pause after submitting


# Screenshot a single element
def save_pic(obj, name):
    """Save a cropped screenshot of one page element as ``51job_<name>.png``.

    A full-page screenshot is written to ``.\\51job.png`` first and the
    element's rectangle is then cropped out of it. Failures are reported on
    stdout instead of being raised (best effort).

    :param obj: element to capture; its ``location`` and ``size`` are read
    :param name: (str) suffix used in the output file name
    """
    try:
        # save_screenshot returns a success flag, which is what gets printed.
        pic_url = web.save_screenshot('.\\51job.png')
        print("%s:截图成功!" % pic_url)
        # Element rectangle, scaled by 1.25: a factor the author obtained by
        # comparing the screenshot with the image shown on the page.
        # NOTE(review): presumably the display scaling of the author's
        # machine -- confirm before running elsewhere.
        left = obj.location['x'] * 1.25
        top = obj.location['y'] * 1.25
        right = left + obj.size['width'] * 1.25
        bottom = top + obj.size['height'] * 1.25
        print('图：' + name)
        print('Left %s' % left)
        print('Top %s' % top)
        print('Right %s' % right)
        print('Bottom %s' % bottom)
        print('')

        file_name = '51job_' + name + '.png'
        # Context manager so the screenshot file handle is always released.
        with Image.open('.\\51job.png') as im:
            im.crop((left, top, right, bottom)).save(file_name)
    except Exception as msg:
        # Exception rather than BaseException, so Ctrl+C / SystemExit still
        # propagate instead of being reported as a failed screenshot.
        print("%s:截图失败!" % msg)


# Make an element visible
def show_element(element):
    """Replace the element's inline style with ``display: block;``.

    :param element: element passed to the script as ``arguments[0]``
    """
    web.execute_script("arguments[0].style=arguments[1]", element, "display: block;")


# Make an element invisible
def hide_element(element):
    """Replace the element's inline style with ``display: none;``.

    :param element: element passed to the script as ``arguments[0]``
    """
    web.execute_script("arguments[0].style=arguments[1]", element, "display: none;")


def cut():
    """Capture the captcha canvases as ``51job_back/slice/full.png``.

    Waits (via the shared ``wait``) until the three captcha canvases are
    present, then screenshots them one after another.
    """
    c_background = wait.until(EC.presence_of_element_located(
        (By.CSS_SELECTOR, 'canvas.geetest_canvas_bg.geetest_absolute')))
    c_slice = wait.until(EC.presence_of_element_located(
        (By.CSS_SELECTOR, 'canvas.geetest_canvas_slice.geetest_absolute')))
    c_full_bg = wait.until(EC.presence_of_element_located(
        (By.CSS_SELECTOR, 'canvas.geetest_canvas_fullbg.geetest_fade.geetest_absolute')))
    # Hide the puzzle piece so the "back" capture shows only the background
    # with the gap.
    hide_element(c_slice)
    save_pic(c_background, 'back')
    # Show the puzzle piece again and capture it.
    show_element(c_slice)
    save_pic(c_slice, 'slice')
    # Force the complete-background canvas to display (presumably it is
    # hidden by default -- verify) and capture it.
    show_element(c_full_bg)
    save_pic(c_full_bg, 'full')


# Decide whether two pixels are the same
def is_pixel_equal(bg_image, fullbg_image, x, y, threshold=20):
    """Return whether both images have (nearly) the same colour at (x, y).

    :param bg_image: (Image) picture with the gap
    :param fullbg_image: (Image) complete picture
    :param x: (Int) x position
    :param y: (Int) y position
    :param threshold: (Int) a channel counts as different once the absolute
        difference reaches this value
    :return: (Boolean) True when the first three channels are all within
        ``threshold`` of each other
    """
    bg_pixel = bg_image.load()[x, y]
    fullbg_pixel = fullbg_image.load()[x, y]
    # abs() must wrap the difference itself. Wrapping the comparison
    # (abs(a - b < threshold)) only noticed differences in one direction.
    return all(
        abs(bg_pixel[channel] - fullbg_pixel[channel]) < threshold
        for channel in range(3)
    )


# Compute how far the slider has to move
def get_distance(bg_image, fullbg_image):
    """Find the x coordinate of the first column where the images differ.

    :param bg_image: (Image) picture with the gap
    :param fullbg_image: (Image) complete picture
    :return: (Int) x coordinate of the first differing pixel, scanning
        columns left to right; None when no differing pixel is found
    """
    # Columns left of this x are skipped (the slider's initial position).
    distance = 60
    width, height = fullbg_image.size
    for i in range(distance, width):
        for j in range(height):
            if not is_pixel_equal(bg_image, fullbg_image, i, j):
                # This x coordinate is the distance the slider must travel.
                return i
    return None


# Solve the slider captcha
def slide():
    """Drag the captcha slider to the gap found in the saved screenshots.

    Compares ``.\\51job_back.png`` with ``.\\51job_full.png`` to locate the
    gap, then performs the drag in two steps with short pauses in between.

    :raises ValueError: when the two screenshots show no differing pixel
    """
    # Local import: this name is not among the file's top-level imports.
    from selenium.common.exceptions import TimeoutException

    # Close both image files as soon as the comparison is done.
    with Image.open('.\\51job_back.png') as bg_image:
        with Image.open('.\\51job_full.png') as full_image:
            gap_x = get_distance(bg_image, full_image)
    if gap_x is None:
        raise ValueError(
            "no gap found: 51job_back.png and 51job_full.png do not differ")
    # Undo the 1.25 screenshot scale factor, and round because the drag
    # offsets are pixel counts.
    distance = round(gap_x / 1.25)

    try:
        # Locate the slider handle.
        slider = wait.until(EC.presence_of_element_located(
            (By.CSS_SELECTOR, 'div.geetest_slider_button')))
    except TimeoutException:
        # wait.until raises on timeout instead of returning a falsy value,
        # so "no slider" has to be detected here.
        print("===没有滑块验证===")
        return

    try:
        print("====有滑块验证=====")
        action_chains = webdriver.ActionChains(web)
        # Press and hold, ready to drag.
        action_chains.click_and_hold(slider)
        action_chains.pause(0.2)
        # Move most of the way in one go ...
        action_chains.move_by_offset(distance - 10, 0)
        action_chains.pause(0.6)
        # ... then add a small correcting move.
        action_chains.move_by_offset(10, 0)
        action_chains.pause(0.6)
        # Release the slider.
        action_chains.release()
        action_chains.perform()
        time.sleep(5)
    except Exception as e:
        print("===" + str(e))


# Collect the job postings of the current page
def get_data(page_nums):
    """Append the job postings on the current page to ``data_job.csv``.

    Each row holds the text of the posting's ``jname``, ``cname``, ``sal``
    and ``d`` elements, in that order.

    :param page_nums: (Int) page number, used only in the progress message
    """
    lists = web.find_element(By.CLASS_NAME, 'j_joblist').find_elements(By.CLASS_NAME, 'e')
    with open('data_job.csv', mode='a', encoding='utf-8', newline='') as f:
        csv_w = csv.writer(f)
        # "item" instead of "list", so the builtin is not shadowed.
        for item in lists:
            job_name = item.find_element(By.CLASS_NAME, 'jname').text
            com_name = item.find_element(By.CLASS_NAME, 'cname').text
            sal_val = item.find_element(By.CLASS_NAME, 'sal').text
            din = item.find_element(By.CLASS_NAME, 'd').text
            csv_w.writerow([job_name, com_name, sal_val, din])
    print(page_nums, 'over!!')


# Type the keyword into the search box and scrape the first max_page pages
def search(input_str, max_page):
    """Search for ``input_str`` and save the first ``max_page`` result pages.

    :param input_str: (str) search keyword
    :param max_page: (Int) number of result pages to save
    """
    web.find_element(By.ID, 'kwdselectid').send_keys(input_str)
    web.find_element(By.XPATH, '/html/body/div[3]/div/div[1]/div/button').click()
    time.sleep(0.5)
    for page_nums in range(1, max_page + 1):
        get_data(page_nums)
        # Only move on while another page is still wanted; the original also
        # clicked once more after the last page had been saved.
        if page_nums < max_page:
            web.find_element(By.CLASS_NAME, 'p_in').click()
            time.sleep(1)


if __name__ == '__main__':
    url = 'https://login.51job.com/login.php?loginway=0&isjump=0&lang=c&from_domain=i&url=http%3A%2F%2Fwww.51job.com%2F'
    try:
        login()
        cut()
        slide()
        search('python', 10)
    finally:
        # quit() instead of close(): ends the whole driver session, and the
        # finally clause releases the browser even when a step above fails.
        web.quit()

标签： python 爬虫 selenium

本文转载自: https://blog.csdn.net/luxppp880/article/details/122582807
版权归原作者 朗风风 所有，如有侵权，请联系我们删除。

爬虫-python -综合练习-51job信息-滑块验证-selenium

利用selenium爬51job职位信息-破解滑块验证

1.爬51job职位信息

0.头文件

1.初始化

2.页面登陆

3.滑块验证

4.获取网站的职位信息

2.爬51job页面-利用selenium-破解滑块验证-完整代码

发表评论

“爬虫-python -综合练习-51job信息-滑块验证-selenium”的评论:

关于作者

overfit同步小助手

相关阅读

文章导航