0


【测试】selenium拦截Ajax(XHR)等异步请求数据

1.说明

在爬取某个网站的时候遇到加密参数,由于js代码经过混淆编译不好破解,所以采用selenium的方式获取参数。但是我们通过selenium获取的数据基本上都是基于页面的,对于网站发起的异步请求,我们可以从浏览器日志中提取。

2.设置driver参数

我们首先要通过Option对象(比如说ChromeOptions)设置监控浏览器日志,旧版本的Selenium是通过DesiredCapabilities设置的,下面是新版本的写法

from selenium.webdriver import Chrome, ChromeOptions
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions
from selenium.webdriver.support.wait import WebDriverWait

# Chrome options shared by the snippets below: a handful of command-line
# switches plus the logging capabilities needed to read network events back
# through driver.get_log().
options = ChromeOptions()
options.add_argument("--no-sandbox")
options.add_argument("--allow-running-insecure-content")
options.add_argument("--ignore-certificate-errors")
options.add_argument("--disable-single-click-autofill")
# The original switch ended in "[8]", a footnote marker that is not part of
# the switch name.
options.add_argument("--disable-autofill-keyboard-accessory-view")
options.add_argument("--disable-full-form-autofill-ios")
# Performance logging: record Network events, skip Page events.
options.add_experimental_option(
    'perfLoggingPrefs',
    {'enableNetwork': True, 'enablePage': False},
)
# Ask for the "browser" and "performance" logs at level ALL.
options.set_capability(
    "goog:loggingPrefs",
    {'browser': 'ALL', 'performance': 'ALL'},
)
options.set_capability(
    "goog:perfLoggingPrefs",
    {'enableNetwork': True, 'enablePage': False, 'enableTimeline': False},
)

3.请求网页

现在实例化一个driver,发起一个网页请求。我这里使用 `WebDriverWait` 显式等待的方式等待某个元素出现,你也可以隐式等待或者直接sleep。如果你不等待,异步请求还没加载完就开始获取,你可能会拿不到想要的数据。

# Start Chrome with the options configured above.
service = Service(executable_path=executable_path)
driver = Chrome(service=service, options=options)
# Registered before navigating so the script is evaluated on every new
# document: it makes navigator.webdriver read as undefined.
driver.execute_cdp_cmd(
    "Page.addScriptToEvaluateOnNewDocument",
    {"source": """Object.defineProperty(navigator, 'webdriver', {get: () => undefined})"""},
)
driver.get(page_url)
# Explicit wait: up to 15 seconds, polling every 0.5 seconds.
wait = WebDriverWait(driver, 15, 0.5)
try:
    # NOTE(review): the class name literal has a trailing space - confirm it is intended.
    wait.until(expected_conditions.presence_of_element_located((By.CLASS_NAME, "item ")))
except Exception as e:
    # Best effort: a timeout is reported but does not stop the script.
    print("WebDriverWait.until timeout error: {}".format(e))
html = driver.execute_script("return document.documentElement.outerHTML")

4.处理日志

访问一下driver的 `log_types` 属性可以获取到所有日志类型,遍历它,通过 `get_log()` 方法获取对应的日志,之后再过滤出自己想要的日志就行。

比如说,我这里是过滤出所有 `Network.requestWillBeSent` 的日志,即发送异步请求的数据,因为我需要该请求的请求头;如果是响应类型的日志(`Network.responseReceived`),它只包含响应头。具体支持的类型可以参考谷歌devtools的文档。

如果需要过滤出Ajax(XHR)请求,可以根据日志的params里的type进行判断,也可以通过它判断

# Collect the signing headers of every XHR request found in the driver logs,
# keyed by the last path segment of the request URL.
sign_dict = dict()  # holds the data we want to keep
for log_type in driver.log_types:
    perf_list = driver.get_log(log_type)
    for row_log in perf_list:
        try:
            log_json = json.loads(row_log['message'])
            message_log = log_json['message']
        except Exception as e:
            # Entries whose message is not the expected JSON are skipped.
            print(e)
            continue
        if message_log.get('method') != 'Network.requestWillBeSent':
            continue
        if message_log.get("params", {}).get("type", "").upper() != "XHR":
            continue
        headers = message_log['params'].get('request', {}).get('headers')
        if not headers:
            continue
        x_sign = headers.get('X-Sign')
        if not x_sign:
            continue
        x_app_id = headers.get('X-AppID')
        x_ts = headers.get('X-Ts')
        print("success:", x_sign, x_app_id, x_ts)
        req_url = message_log['params'].get('request', {}).get('url')
        if not req_url:
            # Without a URL there is nothing to derive the key from.
            continue
        # Key: last path component of the URL with the query string removed.
        key = os.path.split(req_url.split("?")[0])[1]
        sign_dict[key] = {"X-AppID": x_app_id, "X-Sign": x_sign, "X-Ts": x_ts}

注意,如果你想要响应体,`Network.responseReceived` 类型的日志的 `response` 字段是没有响应体的,你需要通过 `params` 字段里的 `requestId` 获取,参考代码如下:

# Collect the JSON response body of every XHR response found in the driver
# logs, keyed by the last path segment of the response URL.
res_body_dict = dict()
for log_type in driver.log_types:
    perf_list = driver.get_log(log_type)
    for row_log in perf_list:
        try:
            log_json = json.loads(row_log['message'])
            message_log = log_json['message']
        except Exception as e:
            # Entries whose message is not the expected JSON are skipped.
            print(e)
            continue
        if message_log.get('method') != 'Network.responseReceived':
            continue
        if message_log.get("params", {}).get("type", "").upper() != "XHR":
            continue
        request_id = message_log['params'].get("requestId")
        if not request_id:
            continue
        req_url = message_log['params'].get('response', {}).get('url')
        if not req_url:
            # Without a URL there is nothing to derive the key from.
            continue
        # Key: last path component of the URL with the query string removed.
        key = os.path.split(req_url.split("?")[0])[1]
        try:
            # The log entry carries no body; fetch it by request id.
            content = driver.execute_cdp_cmd('Network.getResponseBody', {'requestId': request_id})
        except Exception as e:
            # One failing request should not abort the whole scan.
            print("Network.getResponseBody error: {}, requestId:{}".format(e, request_id))
            continue
        body = None
        try:
            body = json.loads(content["body"])
        except Exception as e:
            # Bodies that are not JSON are stored as None.
            print("get_unisat_data_by_selenium() json loads error: {}, content:{}".format(e, content))
        res_body_dict[key] = body

5.完整代码

上面的完整参考代码如下

import json
import os.path

from selenium.webdriver import Chrome, ChromeOptions
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions
from selenium.webdriver.support.wait import WebDriverWait

def get_selenium_driver(executable_path=r"E:\webdriver\chromedriver.exe"):
    """Start a Chrome driver with browser and performance logging enabled.

    Args:
        executable_path: Path to the chromedriver binary.

    Returns:
        The started ``Chrome`` driver. Nothing here closes it; the caller
        should call ``quit()`` when done.
    """
    options = ChromeOptions()
    options.add_argument("--no-sandbox")
    options.add_argument("--allow-running-insecure-content")
    options.add_argument("--ignore-certificate-errors")
    options.add_argument("--disable-single-click-autofill")
    # The original switch ended in "[8]", a footnote marker that is not part
    # of the switch name.
    options.add_argument("--disable-autofill-keyboard-accessory-view")
    options.add_argument("--disable-full-form-autofill-ios")
    # Performance logging: record Network events, skip Page events.
    options.add_experimental_option(
        'perfLoggingPrefs',
        {'enableNetwork': True, 'enablePage': False},
    )
    # Ask for the "browser" and "performance" logs at level ALL.
    options.set_capability(
        "goog:loggingPrefs",
        {'browser': 'ALL', 'performance': 'ALL'},
    )
    options.set_capability(
        "goog:perfLoggingPrefs",
        {'enableNetwork': True, 'enablePage': False, 'enableTimeline': False},
    )
    service = Service(executable_path=executable_path)
    driver = Chrome(service=service, options=options)
    # Evaluated on every new document: makes navigator.webdriver read as
    # undefined.
    driver.execute_cdp_cmd(
        "Page.addScriptToEvaluateOnNewDocument",
        {"source": """Object.defineProperty(navigator, 'webdriver', {get: () => undefined})"""},
    )
    return driver

def get_sign_by_selenium(page_url):
    """Load a page and collect the signing headers of its XHR requests.

    Args:
        page_url: URL of the page to open.

    Returns:
        A dict keyed by the last path segment of each matching request URL
        (query string removed). Each value is a dict holding that request's
        ``X-AppID``, ``X-Sign`` and ``X-Ts`` header values. Only XHR requests
        that carry an ``X-Sign`` header are included.
    """
    # get_selenium_driver() already registers the navigator.webdriver
    # override, so it is not registered a second time here.
    driver = get_selenium_driver()
    try:
        driver.get(page_url)
        # Explicit wait: up to 15 seconds, polling every 0.5 seconds, so the
        # page's async requests have a chance to be sent before logs are read.
        wait = WebDriverWait(driver, 15, 0.5)
        try:
            # NOTE(review): the class name literal has a trailing space - confirm it is intended.
            wait.until(expected_conditions.presence_of_element_located((By.CLASS_NAME, "item ")))
        except Exception as e:
            # Best effort: a timeout is reported and the logs are still scanned.
            print("WebDriverWait.until timeout error: {}".format(e))
        # Debugging hint: the rendered page can be dumped at this point with
        # driver.execute_script("return document.documentElement.outerHTML").

        sign_dict = dict()
        for log_type in driver.log_types:
            perf_list = driver.get_log(log_type)
            for row_log in perf_list:
                try:
                    log_json = json.loads(row_log['message'])
                    message_log = log_json['message']
                except Exception as e:
                    # Entries whose message is not the expected JSON are skipped.
                    print(e)
                    continue
                if message_log.get('method') != 'Network.requestWillBeSent':
                    continue
                if message_log.get("params", {}).get("type", "").upper() != "XHR":
                    continue
                headers = message_log['params'].get('request', {}).get('headers')
                if not headers:
                    continue
                x_sign = headers.get('X-Sign')
                if not x_sign:
                    continue
                x_app_id = headers.get('X-AppID')
                x_ts = headers.get('X-Ts')
                print("success:", x_sign, x_app_id, x_ts)
                req_url = message_log['params'].get('request', {}).get('url')
                if not req_url:
                    # Without a URL there is nothing to derive the key from.
                    continue
                # Key: last path component of the URL with the query string removed.
                key = os.path.split(req_url.split("?")[0])[1]
                sign_dict[key] = {"X-AppID": x_app_id, "X-Sign": x_sign, "X-Ts": x_ts}
        return sign_dict
    finally:
        # Always shut the browser down, even when scanning fails.
        driver.quit()

def get_unisat_data_by_selenium(page_url):
    """Load a page and collect the JSON bodies of its XHR responses.

    Args:
        page_url: URL of the page to open.

    Returns:
        A dict keyed by the last path segment of each XHR response URL
        (query string removed). Each value is the parsed JSON body, or
        ``None`` when the body could not be parsed as JSON. Responses whose
        body cannot be fetched are skipped.
    """
    # get_selenium_driver() already registers the navigator.webdriver
    # override, so it is not registered a second time here.
    driver = get_selenium_driver()
    try:
        driver.get(page_url)
        # Explicit wait: up to 15 seconds, polling every 0.5 seconds, so the
        # page's async requests have a chance to finish before logs are read.
        wait = WebDriverWait(driver, 15, 0.5)
        try:
            # NOTE(review): the class name literal has a trailing space - confirm it is intended.
            wait.until(expected_conditions.presence_of_element_located((By.CLASS_NAME, "item ")))
        except Exception as e:
            # Best effort: a timeout is reported and the logs are still scanned.
            print("WebDriverWait.until timeout error: {}".format(e))

        res_body_dict = dict()
        for log_type in driver.log_types:
            perf_list = driver.get_log(log_type)
            for row_log in perf_list:
                try:
                    log_json = json.loads(row_log['message'])
                    message_log = log_json['message']
                except Exception as e:
                    # Entries whose message is not the expected JSON are skipped.
                    print(e)
                    continue
                if message_log.get('method') != 'Network.responseReceived':
                    continue
                if message_log.get("params", {}).get("type", "").upper() != "XHR":
                    continue
                request_id = message_log['params'].get("requestId")
                if not request_id:
                    continue
                req_url = message_log['params'].get('response', {}).get('url')
                if not req_url:
                    # Without a URL there is nothing to derive the key from.
                    continue
                # Key: last path component of the URL with the query string removed.
                key = os.path.split(req_url.split("?")[0])[1]
                try:
                    # The log entry carries no body; fetch it by request id
                    # while the browser session is still open.
                    content = driver.execute_cdp_cmd('Network.getResponseBody', {'requestId': request_id})
                except Exception as e:
                    # One failing request should not abort the whole scan.
                    print("Network.getResponseBody error: {}, requestId:{}".format(e, request_id))
                    continue
                body = None
                try:
                    body = json.loads(content["body"])
                except Exception as e:
                    # Bodies that are not JSON are stored as None.
                    print("get_unisat_data_by_selenium() json loads error: {}, content:{}".format(e, content))
                res_body_dict[key] = body
        return res_body_dict
    finally:
        # Always shut the browser down, even when scanning fails.
        driver.quit()

if __name__ == '__main__':
    # Example run: print the signing headers captured for a sample page.
    url = "https://unisat.io/brc20?q=bc1pkmnh3nj89uns3yp2mtqqxjns65vy6ca6n5jvp4s8ua8nke69cnjs987vtp"
    print("get_sign_by_selenium(url):", get_sign_by_selenium(url))
    # Alternative: fetch the XHR response bodies instead.
    # print("get_unisat_data_by_selenium(url):", get_unisat_data_by_selenium(url))

附:关于selenium的使用可以参考之前的文章
【测试】Selenium的使用(常用属性方法、元素等待、操作cookie、操作元素、无头模式、获取HTML源码)
【测试】selenium反爬操作
【测试】修改selenium选项配置参数优化性能
【测试】在Linux(CentOS、Ubuntu)无界面服务器使用selenium
【测试】Selenium操作Cookie


本文转载自: https://blog.csdn.net/qq_39147299/article/details/132236245
版权归原作者 冰冷的希望 所有, 如有侵权,请联系我们删除。

“【测试】selenium拦截Ajax(XHR)等异步请求数据”的评论:

还没有评论