爬虫java的实现
文章目录
前言
1 selenium-java+httpclient实现爬取页面,并且通过jdbc批量插入mysql
2 可解决开启请求监控,自动获取token,ajax数据加密返回,无法直接拿数据等问题
3 chromedriver的使用请自行百度(如果步骤全对但仍然报错,请用管理员权限运行你的开发工具)
4 注意:以下代码为demo,需自己根据实际业务修改
一、selenium-java是什么?
示例:selenium-java
二、使用步骤
爬虫目录结构
引入库
maven依赖:
<dependency>
    <groupId>org.seleniumhq.selenium</groupId>
    <artifactId>selenium-java</artifactId>
    <version>4.5.3</version>
</dependency>
<dependency>
    <groupId>commons-io</groupId>
    <artifactId>commons-io</artifactId>
    <version>2.11.0</version>
</dependency>
<dependency>
    <groupId>org.apache.httpcomponents</groupId>
    <artifactId>httpclient</artifactId>
    <version>4.5.13</version>
</dependency>
<dependency>
    <groupId>org.projectlombok</groupId>
    <artifactId>lombok</artifactId>
    <version>1.18.22</version>
    <scope>provided</scope>
</dependency>
<dependency>
    <groupId>com.fasterxml.jackson.core</groupId>
    <artifactId>jackson-databind</artifactId>
    <version>2.11.1</version>
</dependency>
<dependency>
    <groupId>com.fasterxml.jackson.core</groupId>
    <artifactId>jackson-core</artifactId>
    <version>2.11.1</version>
</dependency>
<dependency>
    <groupId>com.fasterxml.jackson.core</groupId>
    <artifactId>jackson-annotations</artifactId>
    <version>2.11.1</version>
</dependency>
<dependency>
    <groupId>mysql</groupId>
    <artifactId>mysql-connector-java</artifactId>
    <version>8.0.23</version>
</dependency>
主方法代码
代码如下(示例):
package test;
import java.io.File;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import org.apache.commons.io.FileUtils;
import org.openqa.selenium.By;
import org.openqa.selenium.JavascriptExecutor;
import org.openqa.selenium.WebElement;
import org.openqa.selenium.chrome.ChromeDriver;
import org.openqa.selenium.chrome.ChromeOptions;
import org.openqa.selenium.devtools.DevTools;
import org.openqa.selenium.devtools.v106.network.Network;
import org.openqa.selenium.devtools.v106.network.model.Headers;
import org.openqa.selenium.devtools.v106.network.model.ResourceType;
import com.fasterxml.jackson.databind.JsonNode;
import com.fasterxml.jackson.databind.ObjectMapper;
import entity.Fa;
import util.MyHttpUtil;
import util.MySqlStrategy;
import util.SerializableUtil;
import util.Utils;/**
*
*
* @author admin
*
*/
public class CrawlerTest {
private static String token ="xxxx";
final static String driverAddr ="C:\\Users\\admin\\AppData\\Local\\Google\\Chrome\\Application\\chromedriver.exe";//登录
final static String url1 ="https://xxxxx/system/login?";//获取详情接口
final static String querySaasUrlTemplate ="https://xxxxxx?id=#{id}";
final static String url2 ="https://xxxxxx?";
final static String userName ="uername";
final static String passWord ="password";
final static File idCacheFile = new File("id.bat");//搜索名称持久化文件
final static File searchNameFile = new File("searchName.bat");
final static Set<String> idSet=getCacheSet(idCacheFile);
final static Set<String> searchNameSet=getCacheSet(searchNameFile);
public staticvoidmain(String[] args){
System.setProperty("webdriver.chrome.driver", driverAddr);// 设置浏览器options
ChromeOptions options = new ChromeOptions();// 关闭界面上的---Chrome正在受到自动软件的控制
options.setExperimentalOption("excludeSwitches", new String[]{"enable-automation"});
ChromeDriver driver = new ChromeDriver(options);
Map<String, Object> command = new HashMap<>();// window.navigator.webdirver
command.put("source","Object.defineProperty(navigator, 'webdriver', {get: () => undefined})");
driver.executeCdpCommand("Page.addScriptToEvaluateOnNewDocument", command);// driver.executeScript("https://raw.githubusercontent.com/wendux/Ajax-hook/master/dist/ajaxhook.min.js");// driver.get("htps://www.baidu.com");// 首先登录
driver.get(url1);
driver.manage().window().maximize();
Utils.sleep(5000);// 设置用户名
driver.findElement(By.xpath("//*[@id=\"phone_number\"]")).sendKeys(userName);
Utils.sleep(1000);// 设置密码
driver.findElement(By.xpath("//*[@id=\"password\"]")).sendKeys(passWord);
Utils.sleep(1000);// 勾选同意
driver.findElement(By.xpath("//*[@id=\"agreement\"]")).click();
Utils.sleep(1000);// 登录
driver.findElement(
By.xpath("//*[@id=\"root\"]/div/div[2]/div[1]/div[2]/div/div/form/div[4]/div/div/div/button")).click();// 获取window窗口句柄
String handel1 = driver.getWindowHandle();
Utils.sleep(1000);
System.out.println("登录成功");
Utils.sleep(3000);
driver.get(url1);
Utils.sleep(3000);// 打开一个新窗口
String js ="window.open(\""+ url2 +"\");";((JavascriptExecutor) driver).executeScript(js);
Utils.sleep(2000);// 切换窗口
Object[] obj = driver.getWindowHandles().toArray();// 监听数据(下标为1的窗口)createRequestListener(1, driver);
driver.switchTo().window(obj[1].toString());
Utils.sleep(1000);//
String searchName="搜索名称";//已经爬取过,不在获取if(searchNameSet.contains(searchName)){
System.out.println(searchName+":已经处理过");return;}
driver.findElement(By.xpath("//*[@id=\"name\"]")).sendKeys(searchName);// 查询
driver.findElement(By.xpath("//*[@id=\"root\"]/section/section/div[2]/div/div[1]/div/form/div[6]/button")).click();
Utils.sleep(2000);
WebElement webElement = null;
try {// 通过是否有下一页按钮,判断是否有数据(没有数据,这一行会抛出异常退出)
webElement = driver.findElement(By
.xpath("//*[@id=\"root\"]/section/section/div[2]/div/div[3]/div/div/div/div/div/ul/li[5]/button"));}catch(Exception exception){// 跳出循环
System.out.println("没有数据");}// 一个字处理完所有数据插入数据库
List<Fa> faList = new ArrayList<>();// 为了防止死循环,最多1000次for(int i =0; i <1000; i++){// 第一次数据不点击if(i !=0){// 分页处理----// 判断是否有可以点击
Boolean isEnabled = webElement.isEnabled();if(isEnabled){// 可以点击
webElement.click();// 点击完休眠等待
Utils.sleep(2000);}else{// 不可以点击说明下一页处理完毕break;}// 每次点击后休眠2秒,取数据}// 说明有数据,直接获取
WebElement tableWebElement = driver.findElement(By.xpath("//*[@id=\"root\"]/section/section/div[2]/div/div[3]/div/div/div/div/div/div/div/div/table/tbody"));
List<WebElement> trList = tableWebElement.findElements(By.tagName("tr"));
System.out.println("");
System.out.println("当前数据页数:"+(i +1));for(WebElement element : trList){
Utils.sleep(500);// System.out.println(element.getText().replace(" ", ""));// 获取详情数据按钮// WebElement// detailElement=element.findElement(By.xpath("//*[@id=\"root\"]/section/section/div[2]/div/div[3]/div/div/div/div/div/div/div/div/table/tbody/tr[1]/td[9]/div/span[1]"));// detailElement.click();// 等待获取json数据完成//判断该条数据是否已经完成
String detailId = element.getAttribute("data-row-key");if(idSet.contains(detailId)){//该条数据已经处理continue;}// 单位
String unit = element.findElement(By.xpath("//td[5]")).getText().replace(" ","");// 国家名称
String countriesName = element.findElement(By.xpath("//td[7]")).getText().replace(" ","");// 通过获取的id发送http请求
String querySaasUrl = querySaasUrlTemplate.replace("#{id}", detailId);
String result = MyHttpUtil.getRequest(token, querySaasUrl);// json解析数据
ObjectMapper mapper = new ObjectMapper();// 定义一个转化对象
try {
JsonNode jsonNode = mapper.readTree(result);if("200".equals(String.valueOf(jsonNode.get("code")))){
JsonNode dataNode = jsonNode.get("data");
System.out.println(dataNode);
Fa fa = mapper.readValue(dataNode.toString(), Fa.class);
fa.setUnit(unit);
fa.setCountriesName(countriesName);// 筛入ajax返回的所有数据
fa.setRowData(dataNode.toString());
faList.add(fa);}else{
System.out.println("获取json数据失败");
System.out.println(jsonNode.toPrettyString());
System.exit(0);}}catch(Exception e){
System.out.print("数据解析异常:");
e.printStackTrace();// 退出
System.exit(0);}}// System.out.println(tableWebElement.getText());}// 插入数据到mysqlif(!faList.isEmpty()){
MySqlStrategy.insertValue(faList);}//将本次跑的参数缓存
searchNameSet.add(searchName);for(Fa factory:faList){
idSet.add(factory.getRowId());}//序列化
SerializableUtil.serialization(searchNameFile,searchNameSet);
SerializableUtil.serialization(idCacheFile, idSet);//
try {
Thread.currentThread().join();}catch(InterruptedException e){
e.printStackTrace();}}/**
* 切换多个窗口需要多个监听
*
* @param i 窗口下标(只区哪个窗口监控的数据,无实际意义)
* @param driver
*/
private staticvoidcreateRequestListener(int i, ChromeDriver driver){
DevTools devTools = driver.getDevTools();
devTools.createSession();
devTools.send(
Network.enable(java.util.Optional.empty(), java.util.Optional.empty(), java.util.Optional.empty()));
devTools.addListener(Network.requestWillBeSent(), res ->{
Utils.sleep(10);
System.out.println("RequestHeaders:"+ res.getRequest().getHeaders());
System.out.println("RequestHeaders:"+ res.getRequest().getUrl());
Headers header = res.getRequest().getHeaders();synchronized(CrawlerTest.class){if(header.containsKey("Authorization")){
token =(String) header.get("Authorization");// 获取token后销毁改监视器
devTools.close();
System.out.println("获取到了token:"+ token);}}});}/**
* 根据url获取ajax数据
*
* @param pattern
* @param callback
*/
public staticvoidinterceptResponseXHRByUrl(int i, DevTools devTools){
devTools.addListener(Network.responseReceived(), responseReceived ->{
try {if(ResourceType.XHR != responseReceived.getType()||!responseReceived.getResponse().getUrl().contains("/xxxxxx")){return;}// 取类型为XHR
String data ="监控数据"+ i +":"+ responseReceived.getType()+":"+ responseReceived.getResponse().getUrl();
Utils.sleep(2);
FileUtils.write(new File("log/re.txt"), data,"UTF-8", true);
FileUtils.write(new File("log/re.txt"),"\r\n","UTF-8", true);
devTools.send(Network.getResponseBody(responseReceived.getRequestId()));}catch(Exception e){
e.printStackTrace();} finally {}});}/**
* 创建一个set集合
* @return
*/
private static Set<String>getCacheSet(File file){//
Set<String> set=new LinkedHashSet<>();//反序列化值
Set<String> cacheSet=SerializableUtil.deserialization(file, set);if(cacheSet!=null){
set=cacheSet;}return set;}}
封装数据实体类
代码如下(示例):
package entity;
import java.util.List;
import com.fasterxml.jackson.annotation.JsonIgnoreProperties;
import com.fasterxml.jackson.annotation.JsonProperty;
import lombok.Data;
/**
 * One crawled record. The crawler in this article builds it with Jackson from the detail
 * endpoint's {@code data} node and then fills in the table-derived fields; JSON properties with
 * no matching field are ignored.
 */
@Data
@JsonIgnoreProperties(ignoreUnknown = true)
public class Fa{
// Raw JSON text of the detail payload, set by the crawler after mapping.
private String rowData;
// Bound to the JSON property "id".
@JsonProperty("id")
private String rowId;
// Text taken from the result table row by the crawler (fifth cell).
private String unit;
// Text taken from the result table row by the crawler (seventh cell).
private String countriesName;
// Child records; may be null when the payload carries none.
private List<FaDetail> detailData;}
封装数据实体类(FaDetail)
代码如下(示例):
package entity;
import com.fasterxml.jackson.annotation.JsonIgnoreProperties;
import lombok.Data;
/**
 * Child record of {@code Fa}, mapped by Jackson; JSON properties with no matching field are
 * ignored.
 */
@Data
@JsonIgnoreProperties(ignoreUnknown = true)
public class FaDetail {
// Key of the parent row; the persistence code in this article assigns it before inserting.
private Long faId;
private String type;}
工具类(Config)
代码如下(示例):
package util;
/**
 * JDBC connection settings for the crawler database.
 *
 * <p>NOTE(review): user name and password are hard-coded in source. That is acceptable for a
 * demo only; load them from the environment or an untracked properties file otherwise.
 */
public final class Config {
    /** JDBC driver class name; fixed to this form for the 8.0 driver. */
    public static final String JDBC_DRIVER = "com.mysql.cj.jdbc.Driver";
    /** Database address; change the database name in the URL to match your setup. */
    public static final String DB_URL = "jdbc:mysql://192.168.111.102:3306/crawler?useSSL=false&allowPublicKeyRetrieval=true&serverTimezone=Asia/Shanghai";
    /** Database user name. */
    public static final String USER = "root";
    /** Database password. */
    public static final String PASSWORD = "Sailing123`";

    /** Constants holder; not meant to be instantiated. */
    private Config() {
    }
}
工具类(MyHttpUtil)
代码如下(示例):
package util;
import org.apache.http.ParseException;
import org.apache.http.client.HttpClient;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;
import java.io.IOException;
public class MyHttpUtil {
private static final String token="a7ee88f8-21d6-4b1d-bfa8-ff478a473304+1000001239406480";
private static final String url="https://xxxxxx?id=xxxxxx";
private static final CloseableHttpClient closeableHttpClient = HttpClients.createDefault();
public staticvoidmain(String[] args){getRequest(token,url);}
public static String getRequest(String token,String url){
HttpGet httpGet=new HttpGet(url.toString());
httpGet.setHeader("authorization", token);
try {
CloseableHttpResponse closeableHttpResponse = closeableHttpClient.execute(httpGet);
String responseString= EntityUtils.toString(closeableHttpResponse.getEntity());return responseString;}catch(ParseException | IOException e){
e.printStackTrace();
System.out.println("请求数据出错,请排查问题");
System.exit(1);}finally {//将连接放回连接池中(下次重新使用)
httpGet.releaseConnection();}return null;}}
工具类(MySqlStrategy)
代码如下(示例):
package util;
import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.PreparedStatement;
import java.sql.ResultSet;
import java.sql.SQLException;
import java.sql.Statement;
import java.util.ArrayList;
import java.util.List;
import entity.Fa;
import entity.FaDetail;
public class MySqlStrategy {
private final static String url = Config.DB_URL;
private final static String user = Config.USER;
private final static String password = Config.PASSWORD;
private static Connection conn =getConnection();// ALTER TABLE factor AUTO_INCREMENT=1;
public staticvoidmain(String[] args){insertValue(null);}
private static Connection getConnection(){
try {
conn = DriverManager.getConnection(url, user, password);}catch(SQLException e){
e.printStackTrace();}return conn;}
public staticvoidinsertValue(List<Fa> datalist){
String sql ="insert into fa values(?,?,?,?,?)";
String gasSql ="insert into fa_detail values(?,?,?)";
try {
conn.setAutoCommit(false);}catch(SQLException e2){
e2.printStackTrace();}try(PreparedStatement statement = conn.prepareStatement(sql, PreparedStatement.RETURN_GENERATED_KEYS);
PreparedStatement detailStatement = conn.prepareStatement(gasSql,
PreparedStatement.RETURN_GENERATED_KEYS)){for(int i =0; i < datalist.size(); i++){
Fa fa = datalist.get(i);creatFaParam(fa, statement);
statement.addBatch();}
statement.executeBatch();
ResultSet generatedKeys = statement.getGeneratedKeys();
List<Long> idList = new ArrayList<>();while(generatedKeys.next()){
idList.add(generatedKeys.getLong(1));}//关闭该结果集close(null,null,generatedKeys);// 给子表插入主表idfor(int i =0; i < datalist.size(); i++){
Fa factory = datalist.get(i);
List<FaDetail> detailList = factory.getDetailData();if(detailList != null){for(FaDetail gas : detailList){
gas.setFaId(idList.get(i));// 准备批量数据creatFaDetailParam(detailStatement, gas);
detailStatement.addBatch();}}}// 对子表进行批量插入
detailStatement.executeBatch();
conn.commit();}catch(Exception e1){//回滚
try {
conn.rollback();}catch(SQLException e){}//说明有重复的key,直接返回if(e1.getMessage().contains("Duplicate entry")){return;}else{//退出程序,排查问题
e1.printStackTrace();
System.exit(1);}}}
private staticvoidcreatFaDetailParam(PreparedStatement statement, FaDetail detail) throws SQLException {
statement.setString(1, null);
statement.setLong(2, detail.getFaId());
statement.setString(3, detail.getType());}
private staticvoidcreatFaParam(Fa fa, PreparedStatement statement) throws SQLException {
statement.setString(1, null);
statement.setString(2, fa.getRowData());
statement.setLong(3, Long.valueOf(fa.getRowId()));
statement.setString(4, fa.getUnit());
statement.setString(5, fa.getCountriesName());}
public staticvoidclose(Connection connection, Statement statement, ResultSet resultSet){
try {if(connection != null)
connection.close();}catch(Exception e){
e.printStackTrace();}
try {if(statement != null)
statement.close();}catch(Exception e){
e.printStackTrace();}
try {if(resultSet != null)
resultSet.close();}catch(Exception e){
e.printStackTrace();}}}
工具类(序列化与反序列化)
代码如下(示例):
package util;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.ObjectInputStream;
import java.io.ObjectOutputStream;
import java.util.LinkedHashSet;
import java.util.Set;
/**
 * Helpers that persist an object to a file with Java serialization and read it back.
 *
 * <p>NOTE(review): Java-native deserialization must only be used on files this program wrote
 * itself; never point it at untrusted input.
 */
public class SerializableUtil {
    /**
     * Manual round-trip demo: writes a one-element set to {@code test.dat} and prints what is
     * read back.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        File file = new File("test.dat");
        Set<String> set = new LinkedHashSet<>();
        set.add("hello");
        SerializableUtil.serialization(file, set);
        Set<String> set1 = SerializableUtil.deserialization(file, new LinkedHashSet<String>());
        System.out.println(set1);
    }

    /**
     * Writes {@code t} to {@code file}, replacing any previous content. An I/O failure is printed
     * and otherwise ignored.
     *
     * @param file target file
     * @param t    object to write; must be serializable
     * @param <T>  type of the object
     */
    public static <T> void serialization(File file, T t) {
        // Both streams are declared as resources so they are closed even if writing fails.
        try (FileOutputStream fileOut = new FileOutputStream(file);
             ObjectOutputStream oos = new ObjectOutputStream(fileOut)) {
            oos.writeObject(t);
            oos.flush();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Reads the object stored in {@code file}.
     *
     * @param file source file
     * @param t    only fixes the expected type at the call site; its value is never used
     * @param <T>  expected type of the stored object
     * @return the stored object, or {@code null} when the file does not exist or cannot be read
     *         (the failure is printed)
     */
    public static <T> T deserialization(File file, T t) {
        if (!file.exists()) {
            return null;
        }
        try (FileInputStream fileIn = new FileInputStream(file);
             ObjectInputStream ois = new ObjectInputStream(fileIn)) {
            // The caller vouches for the stored type; the cast cannot be checked at runtime.
            @SuppressWarnings("unchecked")
            T restored = (T) ois.readObject();
            return restored;
        } catch (IOException | ClassNotFoundException | RuntimeException e) {
            e.printStackTrace();
            return null;
        }
    }
}
工具类(Utils)
代码如下(示例):
package util;
/**
 * Small helpers shared by the crawler classes.
 */
public class Utils {
    /**
     * Pauses the calling thread for roughly {@code time} milliseconds.
     *
     * <p>If the thread is interrupted while sleeping, the method returns early and restores the
     * interrupt flag so callers can still observe the interruption.
     *
     * @param time pause length in milliseconds; {@code null} returns immediately
     */
    public static void sleep(Integer time) {
        if (time == null) {
            return;
        }
        try {
            Thread.sleep(time);
        } catch (InterruptedException e) {
            Thread.currentThread().interrupt();
        }
    }
}
总结
selenium-java结合httpclient可以满足大部分网站的爬虫需求,代码就到这儿了。
版权归原作者 微信:13797120587 所有, 如有侵权,请联系我们删除。