Using Flink to aggregate each day's data from Kafka, computing the result on every incoming record and persisting it
public static void main(String[] args) throws Exception {
    final StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
    env.setStreamTimeCharacteristic(TimeCharacteristic.EventTime);

    Properties props = new Properties();
    props.setProperty("group.id", "test005");
    props.setProperty("key.deserializer", "org.apache.kafka.common.serialization.StringDeserializer");
    props.setProperty("value.deserializer", "org.apache.kafka.common.serialization.StringDeserializer");
    props.setProperty("enable.auto.commit", "false");
    props.setProperty("auto.offset.reset", "latest");
    // Kafka bootstrap server list; it does not have to contain every broker
    props.setProperty("bootstrap.servers", "192.168.0.45:9092,192.168.0.46:9092,192.168.0.47:9092");

    FlinkKafkaConsumer<String> kafkaConsumer = new FlinkKafkaConsumer<>("audit_alarm", new SimpleStringSchema(), props);
    DataStream<String> stream = env.addSource(kafkaConsumer);

    DataStream<DailyData> dailyData = stream.map(new MapFunction<String, DailyData>() {
        @Override
        public DailyData map(String value) throws Exception {
            // day, message and timestamp are extracted from the raw Kafka record (parsing omitted here)
            return new DailyData(day, message, timestamp);
        }
    }).assignTimestampsAndWatermarks(new BoundedOutOfOrdernessTimestampExtractor<DailyData>(Time.seconds(1)) {
        @Override
        public long extractTimestamp(DailyData element) {
            return element.getTimestamp();
        }
    });

    DataStream<AlarmGatherResponse> dailyDataCount = dailyData
            .keyBy(DailyData::getKey)
            .window(TumblingEventTimeWindows.of(Time.days(1)))
            //.trigger(EventTimeTrigger.create())                             // fire once the watermark passes the window end
            .trigger(CountTrigger.of(1))                                      // fire on every incoming record
            //.trigger(ContinuousProcessingTimeTrigger.of(Time.minutes(10)))  // fire periodically by processing time
            .process(new DailyDataCountProcessProcess());

    // url, username and password are the MySQL connection settings (defined elsewhere)
    dailyDataCount.addSink(new MySQLSink(url, username, password));

    env.execute("Daily Data Count");
}
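The pipeline above ends with .process(new DailyDataCountProcessProcess()), a window function whose implementation is not included here. Below is a minimal skeleton of what it might look like; the String key type, the setKey setter on AlarmGatherResponse, and the merge logic are assumptions, not part of the original code.

import mode.AlarmGatherResponse;
import org.apache.flink.streaming.api.functions.windowing.ProcessWindowFunction;
import org.apache.flink.streaming.api.windowing.windows.TimeWindow;
import org.apache.flink.util.Collector;

// Sketch only: the real aggregation (building src/dst/attacker lists, choosing priority,
// collecting related UUIDs, etc.) is not shown in the post.
public class DailyDataCountProcessProcess
        extends ProcessWindowFunction<DailyData, AlarmGatherResponse, String, TimeWindow> {

    @Override
    public void process(String key,
                        Context context,
                        Iterable<DailyData> elements,
                        Collector<AlarmGatherResponse> out) {
        AlarmGatherResponse response = new AlarmGatherResponse();
        response.setKey(key); // group_key later used by MySQLSink for its update-or-insert
        for (DailyData element : elements) {
            // merge each buffered element of the current day into the response here
        }
        out.collect(response);
    }
}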
On the trigger: I had originally wanted it to fire based on time, but then realized that because the environment uses event-time (EventTime) semantics, the trigger should be an EventTimeTrigger rather than a ProcessingTimeTrigger. Changing .trigger(ProcessingTimeTrigger.create()) to .trigger(EventTimeTrigger.create()) makes the trigger take effect. With an EventTimeTrigger, however, the window does not fire for every incoming record; it only fires once the watermark advances past the window's end time. To compute a result immediately for every incoming record, use a CountTrigger instead, e.g. .trigger(CountTrigger.of(1)), which fires each time an element arrives. Note that this significantly increases the computation and data-transfer overhead.
There is also a time-based option: .trigger(ContinuousProcessingTimeTrigger.of(Time.minutes(10))) fires the window operator at a fixed processing-time interval, here every 10 minutes.
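Putting the three options side by side on the same daily window (only one trigger should be active at a time), a condensed restatement of the pipeline above:

DataStream<AlarmGatherResponse> dailyDataCount = dailyData
        .keyBy(DailyData::getKey)
        .window(TumblingEventTimeWindows.of(Time.days(1)))
        // 1) default event-time behaviour: fire once the watermark passes the window end
        //.trigger(EventTimeTrigger.create())
        // 2) fire on every incoming element (higher compute and I/O cost)
        .trigger(CountTrigger.of(1))
        // 3) fire periodically by processing time, e.g. every 10 minutes
        //.trigger(ContinuousProcessingTimeTrigger.of(Time.minutes(10)))
        .process(new DailyDataCountProcessProcess());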
Supplement:
MySQLSink writes the results to MySQL using an update-or-insert pattern: keyed by the generated group key, it first tries to UPDATE the existing row, and if no row is updated it falls back to an INSERT. This keeps the aggregated record for a given day unique.
package utils;

import cn.hutool.json.JSONUtil;
import mode.AlarmGatherResponse;
import org.apache.flink.configuration.Configuration;
import org.apache.flink.streaming.api.functions.sink.RichSinkFunction;

import java.sql.Connection;
import java.sql.Date;
import java.sql.DriverManager;
import java.sql.PreparedStatement;

public class MySQLSink extends RichSinkFunction<AlarmGatherResponse> {

    private Connection connection;
    private PreparedStatement insertStatement;
    private PreparedStatement updateStatement;
    private String jdbcUrl;
    private String username;
    private String password;

    public MySQLSink(String jdbcUrl, String username, String password) {
        this.jdbcUrl = jdbcUrl;
        this.username = username;
        this.password = password;
    }

    @Override
    public void open(Configuration parameters) throws Exception {
        super.open(parameters);
        connection = DriverManager.getConnection(jdbcUrl, username, password);
        // Prepare the insert and update statements once per sink instance
        String insertSql = "insert into t_flink_dailyDataCount(uuid," +
                "start_time," +
                "end_time," +
                "src_list," +
                "dst_list," +
                "attacker_list," +
                "attacked_list," +
                "alarm_type," +
                "priority," +
                "attack_direct," +
                "attack_chain," +
                "update_time," +
                "related_uuid_list,group_key) values(?,?,?,?,?,?,?,?,?,?,?,?,?,?)";
        String updateSql = "UPDATE t_flink_dailyDataCount " +
                "SET end_time = ?," +
                "src_list = ?," +
                "dst_list = ?," +
                "attacker_list = ?," +
                "attacked_list = ?," +
                "alarm_type = ?," +
                "priority = ?," +
                "attack_direct = ?," +
                "attack_chain = ?," +
                "update_time = ? " +
                "WHERE group_key = ?";
        insertStatement = connection.prepareStatement(insertSql);
        updateStatement = connection.prepareStatement(updateSql);
    }

    @Override
    public void close() throws Exception {
        super.close();
        // Release the statements and the connection
        if (insertStatement != null) {
            insertStatement.close();
        }
        if (updateStatement != null) {
            updateStatement.close();
        }
        if (connection != null) {
            connection.close();
        }
    }

    @Override
    public void invoke(AlarmGatherResponse row, Context context) throws Exception {
        // Try to update the existing row for this group key first
        updateStatement.setDate(1, row.getEndTime());
        updateStatement.setString(2, JSONUtil.toJsonStr(row.getSrcList()));
        updateStatement.setString(3, JSONUtil.toJsonStr(row.getDstList()));
        updateStatement.setString(4, JSONUtil.toJsonStr(row.getAttackerList()));
        updateStatement.setString(5, JSONUtil.toJsonStr(row.getAttackedList()));
        updateStatement.setString(6, row.getAlarmType());
        updateStatement.setString(7, row.getPriority());
        updateStatement.setString(8, row.getAttackDirect());
        updateStatement.setInt(9, row.getAttackChain() == null ? 0 : row.getAttackChain());
        updateStatement.setTimestamp(10, new java.sql.Timestamp(System.currentTimeMillis()));
        updateStatement.setString(11, row.getKey());
        int updatedRows = updateStatement.executeUpdate();

        if (updatedRows == 0) {
            // No row was updated, so insert a new one
            insertStatement.setString(1, row.getUuid());
            insertStatement.setDate(2, row.getStartTime());
            insertStatement.setDate(3, row.getEndTime());
            insertStatement.setString(4, JSONUtil.toJsonStr(row.getSrcList()));
            insertStatement.setString(5, JSONUtil.toJsonStr(row.getDstList()));
            insertStatement.setString(6, JSONUtil.toJsonStr(row.getAttackerList()));
            insertStatement.setString(7, JSONUtil.toJsonStr(row.getAttackedList()));
            insertStatement.setString(8, row.getAlarmType());
            insertStatement.setString(9, row.getPriority());
            insertStatement.setString(10, row.getAttackDirect());
            insertStatement.setInt(11, row.getAttackChain() == null ? 0 : row.getAttackChain());
            insertStatement.setDate(12, new Date(System.currentTimeMillis()));
            insertStatement.setString(13, JSONUtil.toJsonStr(row.getRelatedUuidList()));
            insertStatement.setString(14, row.getKey());
            insertStatement.executeUpdate();
        }
    }
}
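As an aside, if t_flink_dailyDataCount has a UNIQUE index on group_key (an assumption, not stated in the post), the same one-row-per-day guarantee can be expressed with a single MySQL upsert instead of the update-then-insert pair; a minimal sketch:

// Hypothetical single-statement alternative to updateSql/insertSql in MySQLSink,
// assuming a UNIQUE index on group_key.
String upsertSql =
        "INSERT INTO t_flink_dailyDataCount(uuid, start_time, end_time, src_list, dst_list, " +
        "attacker_list, attacked_list, alarm_type, priority, attack_direct, attack_chain, " +
        "update_time, related_uuid_list, group_key) " +
        "VALUES (?,?,?,?,?,?,?,?,?,?,?,?,?,?) " +
        "ON DUPLICATE KEY UPDATE " +
        "end_time = VALUES(end_time), src_list = VALUES(src_list), dst_list = VALUES(dst_list), " +
        "attacker_list = VALUES(attacker_list), attacked_list = VALUES(attacked_list), " +
        "alarm_type = VALUES(alarm_type), priority = VALUES(priority), " +
        "attack_direct = VALUES(attack_direct), attack_chain = VALUES(attack_chain), " +
        "update_time = VALUES(update_time)";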