hivesql 将json格式字符串转为数组
完整过程SQL在文末
json 格式字符串
本案例 json 字符串参考格式,请勿使用本数据
{"data":[{"province":11,"id_card":"110182198903224674","name":"闾丘饱乾"},{"province":21,"id_card":"210182198903224674","name":"贺巧"}]}
测试数据
本案例测试数据,复制保存后请勿格式化
{"data":[{"province":11,"id_card":"110182198903224674","name":"闾丘饱乾"},{"province":21,"id_card":"210182198903224674","name":"贺巧"},{"province":31,"id_card":"310182198903224674","name":"方加牡"},{"province":41,"id_card":"410182198903224674","name":"邱赣"},{"province":42,"id_card":"420182198903224674","name":"郝郑惭"},{"province":52,"id_card":"520182198903224674","name":"余烂"},{"province":62,"id_card":"620182198903224674","name":"宇文酚"},{"province":81,"id_card":"810182198903224674","name":"赖队瞻"}]}
创建测试数据库
createdatabase test;
use test;
创建数据表
本案例为数仓分层设计
- 创建ods层原始数据表
- 创建dwd层维度数据表
- ETL转换ods层数据插入到dwd层
创建ods层原始数据表
createtable people_ods(
info string
);
加载测试数据
load data local inpath "/root/people.json" overwrite into table people_ods;
创建dwd层维度数据表
createtable people_dwd(
id_card string,
name string,
province string
);
ETL转换ods层数据插入到dwd层
insert overwrite table people_dwd (select json_tuple(people,'id_card','name','province')as(id_card,name,province)from(select people from(select split(regexp_replace(regexp_replace(get_json_object(info,'$.data'),'\\[|\\]',''),'\\}\\,\\{','\\}\\;\\{'),";") people_list from people_ods) ods
lateral view explode(people_list) t1 as people) t2);
查询测试
select*from people_dwd;
ETL 解析
查询原始数据
select info from people_ods;
获取json格式数组字符串
使用 get_json_object 函数获取 data 属性
select get_json_object(info,'$.data')from people_ods;
将字符串两端的 [] 去掉
使用 regexp_replace 函数将 开头的 [ 和结尾的 ] 替换为 空字,
注意:由于hive使用java语言编写所以需要使用转义字符
select regexp_replace(get_json_object(info,'$.data'),'\\[|\\]','')from people_ods;
清洗后的格式
{"province":11,"id_card":"110182198903224674","name":"闾丘饱乾"},{"province":12,"id_card":"120182198903224674","name":"慕容芋岛"}
将字符串中 },{ 转为 };{ 对象之间使用分号间隔
json格式字符串对象分隔符和属性分隔符都使用的是逗号
使用split函数切分的时候无法区分对象和属性
故而将对象分隔符替换为分号便于split函数切分
select regexp_replace(regexp_replace(get_json_object(info,'$.data'),'\\[|\\]',''),'\\}\\,\\{','\\}\\;\\{')from people_ods;
清洗后的格式
{"province":11,"id_card":"110182198903224674","name":"闾丘饱乾"};{"province":12,"id_card":"120182198903224674","name":"慕容芋岛"}
两次清洗后结果对比
转为 字符串数组
select split(regexp_replace(regexp_replace(get_json_object(info,'$.data'),'\\[|\\]',''),'\\}\\,\\{','\\}\\;\\{'),';')from people_ods;
列转行
使用 UDTF 裂函数 将单行数据转换为多行数据
select people from(select split(regexp_replace(regexp_replace(get_json_object(info,'$.data'),'\\[|\\]',''),'\\}\\,\\{','\\}\\;\\{'),";") people_list from people_ods) ods
lateral view explode(people_list) t as people;
json 解析
使用 get_json_object 或者 json_tuple 函数 对json对象进行解析
本案例使用 json_tuple 函数
select json_tuple(people,'id_card','name','province')as(id_card,name,province)from(select people from(select split(regexp_replace(regexp_replace(get_json_object(info,'$.data'),'\\[|\\]',''),'\\}\\,\\{','\\}\\;\\{'),";") people_list from people_ods) ods
lateral view explode(people_list) t1 as people) t2;
完成过程SQL
-- 创建测试数据库createdatabase test;-- 使用测试数据库use test;-- 创建ods层原始数据表createtable people_ods(
info string
);-- 加载数据loaddatalocal inpath "/root/people.json" overwrite intotable people_ods;-- 查询ods层袁术数据select info from people_ods;-- 获取json格式数组字符串select get_json_object(info,'$.data')from people_ods;-- 将字符串两端的 [] 去掉select regexp_replace(get_json_object(info,'$.data'),'\\[|\\]','')from people_ods;-- 将字符串中 },{ 转为 };{ 对象之间使用分号间隔select regexp_replace(regexp_replace(get_json_object(info,'$.data'),'\\[|\\]',''),'\\}\\,\\{','\\}\\;\\{')from people_ods;-- 转为 字符串数组select split(regexp_replace(regexp_replace(get_json_object(info,'$.data'),'\\[|\\]',''),'\\}\\,\\{','\\}\\;\\{'),';')from people_ods;-- 列转行select people from(select split(regexp_replace(regexp_replace(get_json_object(info,'$.data'),'\\[|\\]',''),'\\}\\,\\{','\\}\\;\\{'),";") people_list from people_ods) ods
lateral view explode(people_list) t as people;-- 转json对象后解析select json_tuple(people,'id_card','name','province')as(id_card,name,province)from(select people from(select split(regexp_replace(regexp_replace(get_json_object(info,'$.data'),'\\[|\\]',''),'\\}\\,\\{','\\}\\;\\{'),";") people_list from people_ods) ods
lateral view explode(people_list) t1 as people) t2;-- 创建dwd层维度数据表createtable people_dwd(
id_card string,
name string,
province string
);-- ETL转换ods层数据插入到dwd层insert overwrite table people_dwd (select json_tuple(people,'id_card','name','province')as(id_card,name,province)from(select people from(select split(regexp_replace(regexp_replace(get_json_object(info,'$.data'),'\\[|\\]',''),'\\}\\,\\{','\\}\\;\\{'),";") people_list from people_ods) ods
lateral view explode(people_list) t1 as people) t2);select*from people_dwd;
版权归原作者 李昊哲小课 所有, 如有侵权,请联系我们删除。