一、数据炸裂
0 问题描述
如何将字符串1-5,16,11-13,9" 扩展成 "1,2,3,4,5,16,11,12,13,9" 且顺序不变。
1 数据准备
-- Sample input: a one-row CTE named data whose single column a holds the
-- compressed string to expand. The queries below repeat this CTE as their source.
with data as (select '1-5,16,11-13,9' as a)
2 数据分析
** 步骤一**:explode(split(a, ',')) 炸裂 + row_number()排序,一行变多行,且对每行的数据排序,保证有序性。
-- Step 1: split the comma-separated string into one row per segment and keep
-- each segment's original position.
-- posexplode() emits the 0-based array index together with the element, so rn
-- comes from the data itself. row_number() over () has no ORDER BY and therefore
-- does not guarantee that rn follows the order of the segments in the string.
with data as (select '1-5,16,11-13,9' as a)
select
    seg.a1,
    seg.pos + 1 as rn  -- 1-based, matching the numbering row_number() produces
from data
lateral view posexplode(split(a, ',')) seg as pos, a1;
输出结果:
步骤二: lateral view explode(split(a, '-')) 、max(b) - min(b) as diff
(1)lateral view +explode 侧写和炸裂,一行变多行,并将源表中每行的输出结果与该行连接;
(2)group by a1, rn ....... select min(a2) as start_data得到每个分组的起始值
(3)max(a2) - min(a1) 得到每个分组的步长
-- Step 2: for every segment compute its start value and its step count (diff).
-- Output columns: a1 (segment text), rn (segment position), start_data, diff.
with data as (select '1-5,16,11-13,9' as a),

-- One row per comma-separated segment; rn is the 1-based position taken from
-- posexplode() so that it is deterministic (row_number() over () is not).
segments as (
    select
        seg.a1,
        seg.pos + 1 as rn
    from data
    lateral view posexplode(split(a, ',')) seg as pos, a1
),

-- One row per endpoint of each segment ('1-5' -> 1 and 5, '16' -> 16).
-- The endpoint is cast to int here: min()/max() on the raw strings compare
-- lexicographically, so a segment such as '9-10' would yield start '10'.
bounds as (
    select
        s.a1,
        s.rn,
        cast(b.a2 as int) as a2
    from segments s
    lateral view explode(split(s.a1, '-')) b as a2
)

select
    a1,
    rn,
    min(a2) as start_data,
    max(a2) - min(a2) as diff  -- 0 for a single-number segment
from bounds
group by a1, rn;
输出结果是:
步骤三: 根据步长生成索引值,起始值加上索引值获取展开值
侧写和炸裂,根据分组的步长 diff 生成对应的索引值pos
(1)lateral view posexplode(split(repeat(',', diff), ',')) table2 as pos, item;
该代码等价于: lateral view posexplode(split(space(diff), '')) table2 as pos, item;
(2)(start_data + pos) as end_data,起始值加上索引值获取展开值
-- Step 3: expand every segment into its individual numbers.
-- Output columns: a1, rn, start_data, diff, end_data (one row per expanded number).
with data as (select '1-5,16,11-13,9' as a),

-- One row per comma-separated segment; rn is the 1-based position taken from
-- posexplode() so that it is deterministic (row_number() over () is not).
segments as (
    select
        seg.a1,
        seg.pos + 1 as rn
    from data
    lateral view posexplode(split(a, ',')) seg as pos, a1
),

-- Endpoints of each segment, cast to int so min()/max() compare numerically
-- rather than as strings ('9-10' would otherwise start at '10').
bounds as (
    select
        s.a1,
        s.rn,
        cast(b.a2 as int) as a2
    from segments s
    lateral view explode(split(s.a1, '-')) b as a2
),

-- Start value and step count per segment.
ranges as (
    select
        a1,
        rn,
        min(a2) as start_data,
        max(a2) - min(a2) as diff
    from bounds
    group by a1, rn
)

-- repeat(',', diff) split on ',' yields diff + 1 elements, so pos runs 0..diff
-- and start_data + pos enumerates every number of the segment.
select
    r.a1,
    r.rn,
    r.start_data,
    r.diff,
    r.start_data + idx.pos as end_data
from ranges r
lateral view posexplode(split(repeat(',', r.diff), ',')) idx as pos, item;
输出结果是:
步骤四(关键的一步) : row_number() over (order by rn,end_data) as rn1 需要进行全局排序,不然最后collect_list聚合的时候,数字顺序是混乱的。
-- Step 4: assign a global sequence number rn1 to every expanded value.
-- Output columns: a1, rn, end_data, rn1.
with data as (select '1-5,16,11-13,9' as a),

-- One row per comma-separated segment; rn is the 1-based position taken from
-- posexplode() so that it is deterministic (row_number() over () is not).
segments as (
    select
        seg.a1,
        seg.pos + 1 as rn
    from data
    lateral view posexplode(split(a, ',')) seg as pos, a1
),

-- Endpoints of each segment, cast to int so min()/max() compare numerically
-- rather than as strings ('9-10' would otherwise start at '10').
bounds as (
    select
        s.a1,
        s.rn,
        cast(b.a2 as int) as a2
    from segments s
    lateral view explode(split(s.a1, '-')) b as a2
),

-- Start value and step count per segment.
ranges as (
    select
        a1,
        rn,
        min(a2) as start_data,
        max(a2) - min(a2) as diff
    from bounds
    group by a1, rn
),

-- One row per expanded number: pos runs 0..diff for each segment.
expanded as (
    select
        r.a1,
        r.rn,
        r.start_data + idx.pos as end_data
    from ranges r
    lateral view posexplode(split(repeat(',', r.diff), ',')) idx as pos, item
)

-- rn orders the segments, end_data orders the numbers inside a segment.
-- The result is sorted by rn1 (not only by rn) so rows inside one segment
-- are displayed in a defined order as well.
select
    a1,
    rn,
    end_data,
    row_number() over (order by rn, end_data) as rn1
from expanded
order by rn1;
步骤五 : concat_ws(',', collect_list(cast(end_data as string)))
对end_data 进行聚合collect_list,再利用concat_ws函数拼接字符串
-- Step 5: concatenate all expanded values into one comma-separated string,
-- preserving the order of the original input. Output column: result.
with data as (select '1-5,16,11-13,9' as a),

-- One row per comma-separated segment; rn is the 1-based position taken from
-- posexplode() so that it is deterministic (row_number() over () is not).
segments as (
    select
        seg.a1,
        seg.pos + 1 as rn
    from data
    lateral view posexplode(split(a, ',')) seg as pos, a1
),

-- Endpoints of each segment, cast to int so min()/max() compare numerically
-- rather than as strings ('9-10' would otherwise start at '10').
bounds as (
    select
        s.a1,
        s.rn,
        cast(b.a2 as int) as a2
    from segments s
    lateral view explode(split(s.a1, '-')) b as a2
),

-- Start value and step count per segment.
ranges as (
    select
        a1,
        rn,
        min(a2) as start_data,
        max(a2) - min(a2) as diff
    from bounds
    group by a1, rn
),

-- One row per expanded number: pos runs 0..diff for each segment.
expanded as (
    select
        r.rn,
        r.start_data + idx.pos as end_data
    from ranges r
    lateral view posexplode(split(repeat(',', r.diff), ',')) idx as pos, item
),

-- Global sequence number: segments in input order, numbers ascending inside.
ordered as (
    select
        end_data,
        row_number() over (order by rn, end_data) as rn1
    from expanded
)

-- collect_list() does not promise to keep the order of a sorted subquery, so
-- the order is made explicit instead: each value is prefixed with its
-- zero-padded rn1, the array is sorted on that fixed-width key, and the
-- 'digits:' prefixes are stripped again after joining.
-- NOTE(review): the 10-digit padding assumes fewer than 10^10 expanded values.
select
    regexp_replace(
        concat_ws(
            ',',
            sort_array(
                collect_list(
                    concat(lpad(cast(rn1 as string), 10, '0'), ':', cast(end_data as string))
                )
            )
        ),
        '[0-9]+:',
        ''
    ) as result
from ordered;
3 小结
数据炸裂的思路一般是:
1.计算区间【a,b】的步长(差值)diff;
2.利用split分割函数+ posexplode等 将一行变成 diff+1 行,生成对应的下角标pos(pos的取值为【0,diff】);
3.【a,b】区间的起始值 (a + pos) 将数据平铺开;
4.基于平铺开后的数据集进一步加工处理,例如:分组聚合等。
上述题目需要注意:collect_list()的底层是ArrayList 集合来实现的,HiveSQL执行时,底层会转换成MR任务,当同时开启多个mapper任务时,无法知道是哪个mapper先计算完,所以可能会出现ArrayList集合中的数据顺序与原来数据插入的顺序不对齐的情况。
关于collect_set() /collect_list() 函数的有序性,相关文章指路:
HiveSQL题——collect_set()/collect_list()聚合函数-CSDN博客HiveSQL题——collect_set()/collect_list()聚合函数https://blog.csdn.net/SHWAITME/article/details/136011647?spm=1001.2014.3001.5502
二、数据合并
0 问题描述
面试题:基于A表的数据生成B表数据
1 数据准备
-- Source table for the merge exercise: one row per (id, name) pair.
-- id is declared as string, so the queries that sort by it cast it to int.
create table if not exists tableA
(
id string comment '用户id',
name string comment '用户姓名'
) comment 'A表';
-- Load the ten sample rows into tableA (insert overwrite replaces existing rows).
-- Rows with the same name on consecutive ids (1-3 and 6-7) form the runs to merge.
insert overwrite table tableA values
('1','aa'),
('2','aa'),
('3','aa'),
('4','d'),
('5','c'),
('6','aa'),
('7','aa'),
('8','e'),
('9','f'),
('10','g');
-- Table with the same two string columns as tableA.
-- NOTE(review): appears to hold the expected output of the merge exercise — confirm.
create table if not exists tableC
(
id string comment '用户id',
name string comment '用户姓名'
) comment 'C表';
-- Load seven rows into tableC (insert overwrite replaces existing rows).
-- Each row carries the largest id of a run of equal names from tableA and the
-- run's names joined with '|'.
insert overwrite table tableC values
('3','aa|aa|aa'),
('4','d'),
('5','c'),
('7','aa|aa'),
('8','e'),
('9','f'),
('10','g');
2 数据分析
步骤1:寻找满足条件的断点
-- Step 1: mark the rows where name differs from the previous row's name.
-- Output columns: id, name, flag (1 = break point, 0 = same as previous row).
with with_prev as (
    select
        id,
        name,
        -- Previous row's name in numeric id order; the first row falls back to
        -- its own name, so it is never flagged as a break point.
        lag(name, 1, name) over (order by cast(id as int)) as lag_name
    from tableA
)
select
    id,
    name,
    case when name != lag_name then 1 else 0 end as flag
from with_prev;
输出结果为:
步骤2:断点处标记为1,非断点处标记为0,并对断点标记值进行累加,构造分组标签
-- Step 2: turn the break-point flags into a group label by accumulating them.
-- Output columns: id, name, grp (rows of one consecutive run share a grp value).
with with_prev as (
    select
        id,
        name,
        -- Previous row's name in numeric id order; defaults to the row's own name.
        lag(name, 1, name) over (order by cast(id as int)) as lag_name
    from tableA
),
flagged as (
    select
        id,
        name,
        -- 1 at a break point, 0 elsewhere.
        case when name != lag_name then 1 else 0 end as flag
    from with_prev
)
select
    id,
    name,
    -- Running total of the flags: increases by one at every break point.
    sum(flag) over (order by cast(id as int)) as grp
from flagged;
输出结果为:
步骤3:按照分组标签进行数据合并,并取得分组中最大值作为id
-- Step 3: merge every run of consecutive equal names into a single row.
-- Output columns: max_id (largest id of the run, as string) and name (the
-- run's names joined with '|').
with with_prev as (
    select
        id,
        name,
        -- Previous row's name in numeric id order; defaults to the row's own name.
        lag(name, 1, name) over (order by cast(id as int)) as lag_name
    from tableA
),
labelled as (
    select
        id,
        name,
        -- Running count of break points = label of the consecutive run.
        sum(if(name != lag_name, 1, 0)) over (order by cast(id as int)) as grp
    from with_prev
)
select
    -- id is a string column: max(id) would compare lexicographically
    -- ('9' > '10'), so the maximum is taken numerically and cast back to string.
    cast(max(cast(id as int)) as string) as max_id,
    -- All names inside one run are equal, so the collection order is irrelevant.
    concat_ws('|', collect_list(name)) as name
from labelled
group by grp;
输出结果为:
通过max_id, grp分组,对name进行 concat_ws('|', collect_list(name)) 聚合拼接,得出最终的结果
3 小结
断点分组问题的算法总结
步骤1:寻找满足条件的断点
步骤2:断点处标记值为1,非断点处标记为0
步骤3:对断点标记值进行累加 sum(xx)over(order by xx),构造分组标签
步骤4:按照分组标签进行分组求解问题
版权归原作者 爱吃辣条byte 所有, 如有侵权,请联系我们删除。