数据背景
hive (ods)> select * from jsont1; OK jsont1.username jsont1.age jsont1.sex jsont1.json user1 18 male {"id":1,"ids":[101,102,103],"total_number":3} user2 28 female {"id":2,"ids":[201,202,203],"total_number":3} user3 38 male {"id":3,"ids":[301,302,303,304],"total_number":4} user4 48 female {"id":4,"ids":[401,402,403,404,405],"total_number":5} user5 58 male {"id":5,"ids":[501,502,503],"total_number":3} user6 68 female {"id":6,"ids":[601,602,603],"total_number":3} user7 78 male {"id":7,"ids":[701,702,703],"total_number":3} NULL NULL NULL Time taken: 0.933 seconds, Fetched: 8 row(s)一.使用hive内置函数处理 1:通过get_json_object()单个取
hive (ods)> select username,age,sex,get_json_object(json,"$.id") id ,get_json_object(json,"$.ids") ids ,get_json_object(json,"$.total_number") totalNumber > from jsont1; OK username age sex id ids totalnumber user1 18 male 1 [101,102,103] 3 user2 28 female 2 [201,202,203] 3 user3 38 male 3 [301,302,303,304] 4 user4 48 female 4 [401,402,403,404,405] 5 user5 58 male 5 [501,502,503] 3 user6 68 female 6 [601,602,603] 3 user7 78 male 7 [701,702,703] 3 NULL NULL NULL NULL NULL Time taken: 0.562 seconds, Fetched: 8 row(s)
还可以取数组中的某个元素
hive (ods)> select username,age,sex,get_json_object(json,"$.id") id , > get_json_object(json,"$.ids[0]") ids0, > get_json_object(json,"$.ids[1]") ids1, > get_json_object(json,"$.ids[2]") ids2, > get_json_object(json,"$.ids[3]") ids3, > get_json_object(json,"$.ids[4]") ids4, > get_json_object(json,"$.ids[5]") ids5, > get_json_object(json,"$.total_number") totalNumber > from jsont1; OK username age sex id ids0 ids1 ids2 ids3 ids4 ids5 totalnumber user1 18 male 1 101 102 103 NULL NULL NULL 3 user2 28 female 2 201 202 203 NULL NULL NULL 3 user3 38 male 3 301 302 303 304 NULL NULL 4 user4 48 female 4 401 402 403 404 405 NULL 5 user5 58 male 5 501 502 503 NULL NULL NULL 3 user6 68 female 6 601 602 603 NULL NULL NULL 3 user7 78 male 7 701 702 703 NULL NULL NULL 3 NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL Time taken: 6.033 seconds, Fetched: 8 row(s) hive (ods)>2.通过hive内置函数json_tuple()一起取
hive (ods)> select json_tuple(json,'id','ids','total_number') > from jsont1; OK c0 c1 c2 1 [101,102,103] 3 2 [201,202,203] 3 3 [301,302,303,304] 4 4 [401,402,403,404,405] 5 5 [501,502,503] 3 6 [601,602,603] 3 7 [701,702,703] 3 NULL NULL NULL Time taken: 1.076 seconds, Fetched: 8 row(s)
以上处理方式缺点:
(1).通过hive内置函数解析出的json数组都是json字符串,并非我们想要的字符串数组.
我们仔细观察它们俩者之间的不同
"[101,102,103]"-->["101","102","103"] 如果我们要通过hql处理如下: hive (ods)> select regexp_replace("[101,102,103]","\[|\]",""); OK _c0 101,102,103 Time taken: 0.237 seconds, Fetched: 1 row(s) hive (ods)> select split(regexp_replace("[101,102,103]","\[|\]",""),","); OK _c0 ["101","102","103"]
(2)通过json_tuple()可以将多个字段一并取出,但是发现不能和原字段并列显示
hive (ods)> select username,age,sex,json_tuple(json,'id','ids','total_number') > from jsont1; FAILED: SemanticException [Error 10081]: UDTF's are not supported outside the SELECt clause, nor nested in expressions
ps:报错:我自己猜测是hive中没有json数组类型
补救措施 lateral view视图
通过下列方式可以(lateral view 拼接视图)
hive (ods)> select username,age,sex,id,ids,num > from jsont1 > lateral view json_tuple(json,'id','ids','total_number') t1 as id,ids,num; OK username age sex id ids num user1 18 male 1 [101,102,103] 3 user2 28 female 2 [201,202,203] 3 user3 38 male 3 [301,302,303,304] 4 user4 48 female 4 [401,402,403,404,405] 5 user5 58 male 5 [501,502,503] 3 user6 68 female 6 [601,602,603] 3 user7 78 male 7 [701,702,703] 3 NULL NULL NULL NULL NULL Time taken: 0.64 seconds, Fetched: 8 row(s)
(3)如果我们需要对字符数组炸裂做进一步处理更加繁琐
hive (ods)> with tmp as (select username,age,sex,id,ids,num > from jsont1 > lateral view json_tuple(json,'id','ids','total_number') t1 as id,ids,num) > select username,age,sex,id,ids1,num > from tmp > lateral view explode(split(regexp_replace(ids,"\[|\]",""),",")) t1 as ids1; OK username age sex id ids1 num user1 18 male 1 101 3 user1 18 male 1 102 3 user1 18 male 1 103 3 user2 28 female 2 201 3 user2 28 female 2 202 3 user2 28 female 2 203 3 user3 38 male 3 301 4 user3 38 male 3 302 4 user3 38 male 3 303 4 user3 38 male 3 304 4 user4 48 female 4 401 5 user4 48 female 4 402 5 user4 48 female 4 403 5 user4 48 female 4 404 5 user4 48 female 4 405 5 user5 58 male 5 501 3 user5 58 male 5 502 3 user5 58 male 5 503 3 user6 68 female 6 601 3 user6 68 female 6 602 3 user6 68 female 6 603 3 user7 78 male 7 701 3 user7 78 male 7 702 3 user7 78 male 7 703 3 Time taken: 0.459 seconds, Fetched: 24 row(s) hive (ods)>
(4)只能处理简单的json,复杂的json不好处理
二.使用自定义UDF函数处理json数据 1.引入依赖(Maven): org.apache.hive:hive-exec:2.3.7 2.自定义UDF
public class ParseJsonArray extends UDF { public ArrayList3.打包使用自定义UDF(我这里是定义临时UDF)evaluate(String jsonStr, String key) { if (Strings.isNullOrEmpty(jsonStr)) { return null; } try { JSONObject jsonObject = JSON.parseObject(jsonStr); JSONArray jsonArray = jsonObject.getJSONArray(key); ArrayList strings = new ArrayList<>(); for (Object o : jsonArray) { strings.add(o.toString()); } return strings; }catch (JsonException e){ return null; } } @Test public void junitParseJsonArray(){ String jsonStr = "{"id":1,"ids":[101,102,103],"total_number":3}"; String key = "ids"; ArrayList list = evaluate(jsonStr, key); System.out.println(JSON.toJSONString(list));//["101","102","103"] } }
hive (ods)> add jar /root/data/tmp_jar/FlumeInterceptor-1.0-SNAPSHOT.jar ; Added [/root/data/tmp_jar/FlumeInterceptor-1.0-SNAPSHOT.jar] to class path Added resources: [/root/data/tmp_jar/FlumeInterceptor-1.0-SNAPSHOT.jar] hive (ods)> create temporary function json_array_me as "com.lagou.dw.flume.interceptor.hive.udf.ParseJsonArray"; OK Time taken: 2.046 seconds hive (ods)>4.使用自定义UDF
hive (ods)> select username,age,sex,json_array_me(json,"ids") ids
> from jsont1; OK username age sex ids user1 18 male ["101","102","103"] user2 28 female ["201","202","203"] user3 38 male ["301","302","303","304"] user4 48 female ["401","402","403","404","405"] user5 58 male ["501","502","503"] user6 68 female ["601","602","603"] user7 78 male ["701","702","703"] NULL NULL NULL Time taken: 0.748 seconds, Fetched: 8 row(s)
突然发现自定义UDF的好处:返回的是真正的字符串数组,而且解析结果可以和普通字段并列显示,不再繁琐.
同时我们对解析后的字符串数组做炸裂(explode)也更加方便
hive (ods)> select username,age,sex,ids1 > from jsont1 > lateral view explode(json_array_me(json,"ids")) t1 as ids1; OK username age sex ids1 user1 18 male 101 user1 18 male 102 user1 18 male 103 user2 28 female 201 user2 28 female 202 user2 28 female 203 user3 38 male 301 user3 38 male 302 user3 38 male 303 user3 38 male 304 user4 48 female 401 user4 48 female 402 user4 48 female 403 user4 48 female 404 user4 48 female 405 user5 58 male 501 user5 58 male 502 user5 58 male 503 user6 68 female 601 user6 68 female 602 user6 68 female 603 user7 78 male 701 user7 78 male 702 user7 78 male 703 Time taken: 1.503 seconds, Fetched: 24 row(s)
拓展:我们如果需要将json都展开呢?
解决办法: 使用两次lateral view处理
hive (ods)> select username,age,sex,id,num,ids1 > from jsont1 > lateral view json_tuple(json,'id','total_number') t1 as id,num > lateral view explode(json_array_me(json,"ids")) t2 as ids1; OK username age sex id num ids1 user1 18 male 1 3 101 user1 18 male 1 3 102 user1 18 male 1 3 103 user2 28 female 2 3 201 user2 28 female 2 3 202 user2 28 female 2 3 203 user3 38 male 3 4 301 user3 38 male 3 4 302 user3 38 male 3 4 303 user3 38 male 3 4 304 user4 48 female 4 5 401 user4 48 female 4 5 402 user4 48 female 4 5 403 user4 48 female 4 5 404 user4 48 female 4 5 405 user5 58 male 5 3 501 user5 58 male 5 3 502 user5 58 male 5 3 503 user6 68 female 6 3 601 user6 68 female 6 3 602 user6 68 female 6 3 603 user7 78 male 7 3 701 user7 78 male 7 3 702 user7 78 male 7 3 703 Time taken: 0.568 seconds, Fetched: 24 row(s)三.对于纯json文件可以用SerDe解析
创建表可以指定表的序列化方式
TextFile:org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
OrcSerde:org.apache.hadoop.hive.ql.io.orc.OrcSerde
Parquet:org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe
例如:指定序列化方式org.apache.hive.hcatalog.data.JsonSerDe
[root@Linux122 source]# cat json_data.txt {"id":1,"ids":[101,102,103],"total_number":3} {"id":2,"ids":[201,202,203],"total_number":3} {"id":3,"ids":[301,302,303,304],"total_number":4} {"id":4,"ids":[401,402,403,404,405],"total_number":5} {"id":5,"ids":[501,502,503],"total_number":3} {"id":6,"ids":[601,602,603],"total_number":3} {"id":7,"ids":[701,702,703],"total_number":3} hive (ods)> create table json2( > id int, > ids array<string>, > total_number int) > row format serde 'org.apache.hive.hcatalog.data.JsonSerDe'; OK Time taken: 0.702 seconds hive (ods)> load data local inpath '/root/data/source/json_data.txt' into table json2; Loading data to table ods.json2 OK Time taken: 17.651 seconds hive (ods)> select * from json2; OK json2.id json2.ids json2.total_number 1 ["101","102","103"] 3 2 ["201","202","203"] 3 3 ["301","302","303","304"] 4 4 ["401","402","403","404","405"] 5 5 ["501","502","503"] 3 6 ["601","602","603"] 3 7 ["701","702","703"] 3 Time taken: 0.94 seconds, Fetched: 7 row(s)
欢迎分享,转载请注明来源:内存溢出
评论列表(0条)