0


爬虫selenium-java

爬虫java的实现


文章目录


前言

1 selenium-java+httpclient实现爬取页面,并且通过jdbc批量插入mysql
2 可解决开启请求监控,自动获取token,ajax数据加密返回,无法直接拿数据等问题
3 chromedriver 的安装与使用请自行搜索(如果步骤全对仍然报错,请以管理员权限运行你的开发工具)
4 注意:以下代码为demo,需自己根据实际业务修改


一、selenium-java是什么?

示例:selenium-java

二、使用步骤

爬虫目录结构

在这里插入图片描述

引入库

Maven 依赖:

<dependency>
    <groupId>org.seleniumhq.selenium</groupId>
    <artifactId>selenium-java</artifactId>
    <version>4.5.3</version>
</dependency>
<dependency>
    <groupId>commons-io</groupId>
    <artifactId>commons-io</artifactId>
    <version>2.11.0</version>
</dependency>
<dependency>
    <groupId>org.apache.httpcomponents</groupId>
    <artifactId>httpclient</artifactId>
    <version>4.5.13</version>
</dependency>
<dependency>
    <groupId>org.projectlombok</groupId>
    <artifactId>lombok</artifactId>
    <version>1.18.22</version>
    <scope>provided</scope>
</dependency>
<dependency>
    <groupId>com.fasterxml.jackson.core</groupId>
    <artifactId>jackson-databind</artifactId>
    <version>2.11.1</version>
</dependency>
<dependency>
    <groupId>com.fasterxml.jackson.core</groupId>
    <artifactId>jackson-core</artifactId>
    <version>2.11.1</version>
</dependency>
<dependency>
    <groupId>com.fasterxml.jackson.core</groupId>
    <artifactId>jackson-annotations</artifactId>
    <version>2.11.1</version>
</dependency>
<dependency>
    <groupId>mysql</groupId>
    <artifactId>mysql-connector-java</artifactId>
    <version>8.0.23</version>
</dependency>

主方法代码

代码如下(示例):

package test;

import java.io.File;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;

import org.apache.commons.io.FileUtils;
import org.openqa.selenium.By;
import org.openqa.selenium.JavascriptExecutor;
import org.openqa.selenium.WebElement;
import org.openqa.selenium.chrome.ChromeDriver;
import org.openqa.selenium.chrome.ChromeOptions;
import org.openqa.selenium.devtools.DevTools;
import org.openqa.selenium.devtools.v106.network.Network;
import org.openqa.selenium.devtools.v106.network.model.Headers;
import org.openqa.selenium.devtools.v106.network.model.ResourceType;

import com.fasterxml.jackson.databind.JsonNode;
import com.fasterxml.jackson.databind.ObjectMapper;

import entity.Fa;
import util.MyHttpUtil;
import util.MySqlStrategy;
import util.SerializableUtil;
import util.Utils;/**
 * 
 * 
 * @author admin
 *
 */
public class CrawlerTest {
    private static String token ="xxxx";
    final static String driverAddr ="C:\\Users\\admin\\AppData\\Local\\Google\\Chrome\\Application\\chromedriver.exe";//登录
    final static String url1 ="https://xxxxx/system/login?";//获取详情接口
    final static String querySaasUrlTemplate ="https://xxxxxx?id=#{id}";
    final static String url2 ="https://xxxxxx?";
    final static String userName ="uername";
    final static String passWord ="password";
    final static File idCacheFile = new File("id.bat");//搜索名称持久化文件
    final static File searchNameFile = new File("searchName.bat");
    final static Set<String> idSet=getCacheSet(idCacheFile);
    final static Set<String> searchNameSet=getCacheSet(searchNameFile);

    public staticvoidmain(String[] args){
        System.setProperty("webdriver.chrome.driver", driverAddr);// 设置浏览器options
        ChromeOptions options = new ChromeOptions();// 关闭界面上的---Chrome正在受到自动软件的控制
        options.setExperimentalOption("excludeSwitches", new String[]{"enable-automation"});
        ChromeDriver driver = new ChromeDriver(options);
        Map<String, Object> command = new HashMap<>();// window.navigator.webdirver
        command.put("source","Object.defineProperty(navigator, 'webdriver', {get: () => undefined})");
        driver.executeCdpCommand("Page.addScriptToEvaluateOnNewDocument", command);// driver.executeScript("https://raw.githubusercontent.com/wendux/Ajax-hook/master/dist/ajaxhook.min.js");// driver.get("htps://www.baidu.com");// 首先登录
        driver.get(url1);
        driver.manage().window().maximize();
        Utils.sleep(5000);// 设置用户名
        driver.findElement(By.xpath("//*[@id=\"phone_number\"]")).sendKeys(userName);
        Utils.sleep(1000);// 设置密码
        driver.findElement(By.xpath("//*[@id=\"password\"]")).sendKeys(passWord);
        Utils.sleep(1000);// 勾选同意
        driver.findElement(By.xpath("//*[@id=\"agreement\"]")).click();
        Utils.sleep(1000);// 登录
        driver.findElement(
                By.xpath("//*[@id=\"root\"]/div/div[2]/div[1]/div[2]/div/div/form/div[4]/div/div/div/button")).click();// 获取window窗口句柄
        String handel1 = driver.getWindowHandle();
        Utils.sleep(1000);
        System.out.println("登录成功");
        Utils.sleep(3000);
        driver.get(url1);
        Utils.sleep(3000);// 打开一个新窗口
        String js ="window.open(\""+ url2 +"\");";((JavascriptExecutor) driver).executeScript(js);
        Utils.sleep(2000);// 切换窗口
        Object[] obj = driver.getWindowHandles().toArray();// 监听数据(下标为1的窗口)createRequestListener(1, driver);
        driver.switchTo().window(obj[1].toString());
        Utils.sleep(1000);//
        String searchName="搜索名称";//已经爬取过,不在获取if(searchNameSet.contains(searchName)){
            System.out.println(searchName+":已经处理过");return;}
        driver.findElement(By.xpath("//*[@id=\"name\"]")).sendKeys(searchName);// 查询
        driver.findElement(By.xpath("//*[@id=\"root\"]/section/section/div[2]/div/div[1]/div/form/div[6]/button")).click();
        Utils.sleep(2000);
        WebElement webElement = null;
        try {// 通过是否有下一页按钮,判断是否有数据(没有数据,这一行会抛出异常退出)
            webElement = driver.findElement(By
                    .xpath("//*[@id=\"root\"]/section/section/div[2]/div/div[3]/div/div/div/div/div/ul/li[5]/button"));}catch(Exception exception){// 跳出循环
            System.out.println("没有数据");}// 一个字处理完所有数据插入数据库
        List<Fa> faList = new ArrayList<>();// 为了防止死循环,最多1000次for(int i =0; i <1000; i++){// 第一次数据不点击if(i !=0){// 分页处理----// 判断是否有可以点击
                Boolean isEnabled = webElement.isEnabled();if(isEnabled){// 可以点击
                    webElement.click();// 点击完休眠等待
                    Utils.sleep(2000);}else{// 不可以点击说明下一页处理完毕break;}// 每次点击后休眠2秒,取数据}// 说明有数据,直接获取
            WebElement tableWebElement = driver.findElement(By.xpath("//*[@id=\"root\"]/section/section/div[2]/div/div[3]/div/div/div/div/div/div/div/div/table/tbody"));
            List<WebElement> trList = tableWebElement.findElements(By.tagName("tr"));
            System.out.println("");
            System.out.println("当前数据页数:"+(i +1));for(WebElement element : trList){
                Utils.sleep(500);// System.out.println(element.getText().replace(" ", ""));// 获取详情数据按钮// WebElement// detailElement=element.findElement(By.xpath("//*[@id=\"root\"]/section/section/div[2]/div/div[3]/div/div/div/div/div/div/div/div/table/tbody/tr[1]/td[9]/div/span[1]"));// detailElement.click();// 等待获取json数据完成//判断该条数据是否已经完成
                String detailId = element.getAttribute("data-row-key");if(idSet.contains(detailId)){//该条数据已经处理continue;}// 单位
                String unit = element.findElement(By.xpath("//td[5]")).getText().replace(" ","");// 国家名称
                String countriesName = element.findElement(By.xpath("//td[7]")).getText().replace(" ","");// 通过获取的id发送http请求
                String querySaasUrl = querySaasUrlTemplate.replace("#{id}", detailId);
                String result = MyHttpUtil.getRequest(token, querySaasUrl);// json解析数据
                ObjectMapper mapper = new ObjectMapper();// 定义一个转化对象
                try {
                    JsonNode jsonNode = mapper.readTree(result);if("200".equals(String.valueOf(jsonNode.get("code")))){
                        JsonNode dataNode = jsonNode.get("data");
                        System.out.println(dataNode);
                        Fa fa = mapper.readValue(dataNode.toString(), Fa.class);
                        fa.setUnit(unit);
                        fa.setCountriesName(countriesName);// 筛入ajax返回的所有数据
                        fa.setRowData(dataNode.toString());
                        faList.add(fa);}else{
                        System.out.println("获取json数据失败");
                        System.out.println(jsonNode.toPrettyString());
                        System.exit(0);}}catch(Exception e){
                    System.out.print("数据解析异常:");
                    e.printStackTrace();// 退出
                    System.exit(0);}}// System.out.println(tableWebElement.getText());}// 插入数据到mysqlif(!faList.isEmpty()){
           MySqlStrategy.insertValue(faList);}//将本次跑的参数缓存
        searchNameSet.add(searchName);for(Fa factory:faList){
            idSet.add(factory.getRowId());}//序列化
        SerializableUtil.serialization(searchNameFile,searchNameSet);
        SerializableUtil.serialization(idCacheFile, idSet);//
        try {
            Thread.currentThread().join();}catch(InterruptedException e){
            e.printStackTrace();}}/**
     * 切换多个窗口需要多个监听
     * 
     * @param i      窗口下标(只区哪个窗口监控的数据,无实际意义)
     * @param driver
     */
    private staticvoidcreateRequestListener(int i, ChromeDriver driver){
        DevTools devTools = driver.getDevTools();
        devTools.createSession();
        devTools.send(
                Network.enable(java.util.Optional.empty(), java.util.Optional.empty(), java.util.Optional.empty()));
        devTools.addListener(Network.requestWillBeSent(), res ->{
            Utils.sleep(10);
            System.out.println("RequestHeaders:"+ res.getRequest().getHeaders());
            System.out.println("RequestHeaders:"+ res.getRequest().getUrl());
            Headers header = res.getRequest().getHeaders();synchronized(CrawlerTest.class){if(header.containsKey("Authorization")){
                    token =(String) header.get("Authorization");// 获取token后销毁改监视器
                    devTools.close();
                    System.out.println("获取到了token:"+ token);}}});}/**
     * 根据url获取ajax数据
     * 
     * @param pattern
     * @param callback
     */
    public staticvoidinterceptResponseXHRByUrl(int i, DevTools devTools){
        devTools.addListener(Network.responseReceived(), responseReceived ->{
            try {if(ResourceType.XHR != responseReceived.getType()||!responseReceived.getResponse().getUrl().contains("/xxxxxx")){return;}// 取类型为XHR
                String data ="监控数据"+ i +":"+ responseReceived.getType()+":"+ responseReceived.getResponse().getUrl();
                Utils.sleep(2);
                FileUtils.write(new File("log/re.txt"), data,"UTF-8", true);
                FileUtils.write(new File("log/re.txt"),"\r\n","UTF-8", true);
                devTools.send(Network.getResponseBody(responseReceived.getRequestId()));}catch(Exception e){
                e.printStackTrace();} finally {}});}/**
     * 创建一个set集合
     * @return
     */
    private static Set<String>getCacheSet(File file){//
        Set<String> set=new LinkedHashSet<>();//反序列化值
        Set<String>  cacheSet=SerializableUtil.deserialization(file, set);if(cacheSet!=null){
            set=cacheSet;}return set;}}

封装数据实体类

代码如下(示例):

package entity;

import java.util.List;

import com.fasterxml.jackson.annotation.JsonIgnoreProperties;
import com.fasterxml.jackson.annotation.JsonProperty;

import lombok.Data;

/**
 * One crawled record: values read from the result table row plus the detail
 * JSON returned for it. Accessors are generated by Lombok; JSON properties
 * without a matching field are ignored during deserialization.
 */
@Data
@JsonIgnoreProperties(ignoreUnknown = true)
public class Fa {

    /** Raw JSON of the detail response's "data" node. */
    private String rowData;

    /** Record id; bound to the JSON property "id". */
    @JsonProperty("id")
    private String rowId;

    /** Unit text taken from the table row. */
    private String unit;

    /** Country name text taken from the table row. */
    private String countriesName;

    /** Child records stored in the detail table. */
    private List<FaDetail> detailData;
}

封装数据实体类

代码如下(示例):

package entity;

import com.fasterxml.jackson.annotation.JsonIgnoreProperties;

import lombok.Data;

/**
 * Child record of {@code Fa}. Accessors are generated by Lombok; JSON
 * properties without a matching field are ignored during deserialization.
 */
@Data
@JsonIgnoreProperties(ignoreUnknown = true)
public class FaDetail {

    /** Generated key of the parent row; assigned just before the insert. */
    private Long faId;

    /** Detail type value. */
    private String type;
}

工具类(Config)

代码如下(示例):

package util;

/**
 * Database settings. Each connection value can be overridden with a JVM system
 * property (for example {@code -Dcrawler.db.password=...}) so that credentials
 * do not have to be edited in source; the built-in defaults are unchanged.
 */
public final class Config {
    /** JDBC driver class name; fixed for MySQL Connector/J 8.0. */
    public static final String JDBC_DRIVER = "com.mysql.cj.jdbc.Driver";

    /** JDBC URL; override with system property {@code crawler.db.url}. */
    public static final String DB_URL = System.getProperty("crawler.db.url",
            "jdbc:mysql://192.168.111.102:3306/crawler?useSSL=false&allowPublicKeyRetrieval=true&serverTimezone=Asia/Shanghai");

    /** Database user; override with system property {@code crawler.db.user}. */
    public static final String USER = System.getProperty("crawler.db.user", "root");

    /** Database password; override with system property {@code crawler.db.password}. */
    public static final String PASSWORD = System.getProperty("crawler.db.password", "Sailing123`");

    private Config() {
        // Constants holder; not meant to be instantiated.
    }
}

工具类(MyHttpUtil)

代码如下(示例):

package util;
import org.apache.http.ParseException;
import org.apache.http.client.HttpClient;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;
import java.io.IOException;
public class MyHttpUtil {
    private static final String token="a7ee88f8-21d6-4b1d-bfa8-ff478a473304+1000001239406480";
    private static final String url="https://xxxxxx?id=xxxxxx";
    private static final CloseableHttpClient closeableHttpClient = HttpClients.createDefault();
    public staticvoidmain(String[] args){getRequest(token,url);}
    public static String getRequest(String token,String url){
         HttpGet httpGet=new HttpGet(url.toString());
         httpGet.setHeader("authorization", token);
         try {
            CloseableHttpResponse closeableHttpResponse = closeableHttpClient.execute(httpGet);
            String responseString= EntityUtils.toString(closeableHttpResponse.getEntity());return responseString;}catch(ParseException | IOException e){
            e.printStackTrace();
            System.out.println("请求数据出错,请排查问题");
            System.exit(1);}finally {//将连接放回连接池中(下次重新使用)
            httpGet.releaseConnection();}return null;}}

MySqlStrategy

代码如下(示例):

package util;

import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.PreparedStatement;
import java.sql.ResultSet;
import java.sql.SQLException;
import java.sql.Statement;
import java.util.ArrayList;
import java.util.List;

import entity.Fa;
import entity.FaDetail;

public class MySqlStrategy {
    private final static String url = Config.DB_URL;
    private final static String user = Config.USER;
    private final static String password = Config.PASSWORD;
    private static Connection conn =getConnection();// ALTER TABLE factor AUTO_INCREMENT=1;
    public staticvoidmain(String[] args){insertValue(null);}

    private static Connection getConnection(){
        try {
            conn = DriverManager.getConnection(url, user, password);}catch(SQLException e){
            e.printStackTrace();}return conn;}

    public staticvoidinsertValue(List<Fa> datalist){
        String sql ="insert into fa values(?,?,?,?,?)";
        String gasSql ="insert into fa_detail values(?,?,?)";
        try {
            conn.setAutoCommit(false);}catch(SQLException e2){
            e2.printStackTrace();}try(PreparedStatement statement = conn.prepareStatement(sql, PreparedStatement.RETURN_GENERATED_KEYS);
                PreparedStatement detailStatement = conn.prepareStatement(gasSql,
                        PreparedStatement.RETURN_GENERATED_KEYS)){for(int i =0; i < datalist.size(); i++){
                Fa fa = datalist.get(i);creatFaParam(fa, statement);
                statement.addBatch();}
            statement.executeBatch();
            ResultSet generatedKeys = statement.getGeneratedKeys();
            List<Long> idList = new ArrayList<>();while(generatedKeys.next()){
                idList.add(generatedKeys.getLong(1));}//关闭该结果集close(null,null,generatedKeys);// 给子表插入主表idfor(int i =0; i < datalist.size(); i++){
                Fa factory = datalist.get(i);
                List<FaDetail> detailList = factory.getDetailData();if(detailList != null){for(FaDetail gas : detailList){
                        gas.setFaId(idList.get(i));// 准备批量数据creatFaDetailParam(detailStatement, gas);
                        detailStatement.addBatch();}}}// 对子表进行批量插入
            detailStatement.executeBatch();
            conn.commit();}catch(Exception e1){//回滚
            try {
                conn.rollback();}catch(SQLException e){}//说明有重复的key,直接返回if(e1.getMessage().contains("Duplicate entry")){return;}else{//退出程序,排查问题
                e1.printStackTrace();
                System.exit(1);}}}
    
    private staticvoidcreatFaDetailParam(PreparedStatement statement, FaDetail detail) throws SQLException {
        statement.setString(1, null);
        statement.setLong(2, detail.getFaId());
        statement.setString(3, detail.getType());}

    private staticvoidcreatFaParam(Fa fa, PreparedStatement statement) throws SQLException {
        statement.setString(1, null);
        statement.setString(2, fa.getRowData());
        statement.setLong(3, Long.valueOf(fa.getRowId()));
        statement.setString(4, fa.getUnit());
        statement.setString(5, fa.getCountriesName());}

    public staticvoidclose(Connection connection, Statement statement, ResultSet resultSet){
        try {if(connection != null)
                connection.close();}catch(Exception e){
            e.printStackTrace();}
        try {if(statement != null)
                statement.close();}catch(Exception e){
            e.printStackTrace();}
        try {if(resultSet != null)
                resultSet.close();}catch(Exception e){
            e.printStackTrace();}}}

工具类(序列化与反序列化)

代码如下(示例):

package util;

import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.ObjectInputStream;
import java.io.ObjectOutputStream;
import java.util.LinkedHashSet;
import java.util.Set;

/**
 * Helpers that persist an object to a file with Java serialization and read it
 * back. Intended for local cache files written by this program; Java
 * deserialization must not be pointed at files from untrusted sources.
 */
public class SerializableUtil {

    /** Manual round-trip check: writes a one-element set and prints what is read back. */
    public static void main(String[] args) {
        File file = new File("test.dat");
        Set<String> set = new LinkedHashSet<>();
        set.add("hello");
        SerializableUtil.serialization(file, set);
        Set<String> set1 = SerializableUtil.deserialization(file, new LinkedHashSet<String>());
        System.out.println(set1);
    }

    /**
     * Writes {@code t} to {@code file}, replacing any previous content. An I/O
     * failure is printed and otherwise ignored.
     *
     * @param file destination file
     * @param t    object to write; must be serializable
     */
    public static <T> void serialization(File file, T t) {
        // try-with-resources closes the stream even when writeObject fails.
        try (ObjectOutputStream out = new ObjectOutputStream(new FileOutputStream(file))) {
            out.writeObject(t);
            out.flush();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Reads the object stored in {@code file}.
     *
     * @param file source file
     * @param t    only fixes the expected type {@code T}; its value is not read
     * @return the stored object, or {@code null} when the file does not exist
     *         or cannot be read (the failure is printed)
     */
    public static <T> T deserialization(File file, T t) {
        if (!file.exists()) {
            return null;
        }
        try (ObjectInputStream in = new ObjectInputStream(new FileInputStream(file))) {
            // The caller vouches for the stored type; erasure prevents a runtime check.
            @SuppressWarnings("unchecked")
            T restored = (T) in.readObject();
            return restored;
        } catch (Exception e) {
            e.printStackTrace();
            return null;
        }
    }
}

工具类

代码如下(示例):

package util;

/**
 * Small helpers shared by the crawler.
 */
public class Utils {

    /**
     * Pauses the current thread for the given number of milliseconds. A
     * {@code null} or negative value returns immediately. If the thread is
     * interrupted while sleeping, the method returns early and restores the
     * interrupt flag so callers can still observe the interruption.
     *
     * @param time duration in milliseconds
     */
    public static void sleep(Integer time) {
        if (time == null || time < 0) {
            return;
        }
        try {
            Thread.sleep(time);
        } catch (InterruptedException e) {
            // Thread.sleep clears the flag when it throws; set it again.
            Thread.currentThread().interrupt();
        }
    }
}

总结

selenium-java 结合 httpclient 可以满足大部分网站的爬虫需求,本文的示例代码就到这儿了。

标签: java selenium

本文转载自: https://blog.csdn.net/weixin_44588176/article/details/127917281
版权归原作者 微信:13797120587 所有, 如有侵权,请联系我们删除。

“爬虫selenium-java”的评论:

还没有评论