离线Hive数据仓库学习 hbase(main):005:0> count 'events_db:train' flume -> kafka -> hbase -> hive [root@gree139 hive110]# nohup ./bin/hive --service hiveserver2 & [root@gree139 hive110]# nohup ./bin/hive --service metastore & //避免在map阶段聚合 set hive.auto.convert.join=false 确认业务 --> 声明粒度 --> 确认维度 --> 确认事实 ODS 数据准备区 贴源层 //原始数据 DWD 细节数据层 // 脱敏/清洗 DWS 数据服务层 //轻度汇总 聚合 DM //大宽表,基于某个业务,把不同维度的信息通过粒度聚合起来 DWT 主题层 大宽表 ADS //数据应用层,精确数据,为不同用户所使用
//创建 create database dwd_events; use dwd_events; // 处理 user ,将hbase中的users表映射到hive中,再以orc格式保存到hive表中 create external table hb_users( userid string, birthyear int, gender string, locale string, joinedAt string, location string, timezone string )stored by 'org.apache.hadoop.hive.hbase.HbaseStorageHandler' with serdeproperties ('hbase.columns.mapping'=':key,profile:birthyear,profile:gender,region:locale,registration:joinedAt,region:location,region:timezone') tblproperties ('hbase.table.name'='events_db:users'); select * from hb_users; create table users stored as orc as select * from hb_users; select * from users; drop table if exists hb_users; // events 将hbase中 events表数据映射到hive表中,并将以orc格式保存 create external table hb_events( eventid string, user_id string, city string, state string, zip string, country string, lat float, lng float, common_words string, start_time string ) stored by 'org.apache.hadoop.hive.hbase.HbaseStorageHandler' with serdeproperties ('hbase.columns.mapping'=":key,creator:user_id,location:city,location:state,location:zip,location:country,location:lat,location:lng,remark:common_words,schedule:start_time") tblproperties ('hbase.table.name'='events_db:events'); select * from hb_events; create table events stored as orc as select * from hb_events; select * from events; drop table if exists hb_events; show tables; //eventattend create external table hb_event_attendee( row_key string, eventid string, friendid string, state string ) stored by 'org.apache.hadoop.hive.hbase.HbaseStorageHandler' with serdeproperties ('hbase.columns.mapping'=':key, euat:eventid,euat:friendid,euat:state') tblproperties ('hbase.table.name'='events_db:event_attendee'); create table event_attendee stored as orc as select * from hb_event_attendee; select * from event_attendee; drop table if exists hb_event_attendee; // userfriend create external table hb_user_friend( row_key string, userid string, friendid string ) stored by 'org.apache.hadoop.hive.hbase.HbaseStorageHandler' with serdeproperties ('hbase.columns.mapping'=':key, uf:userid,uf:friendid') tblproperties ('hbase.table.name'='events_db:user_friend'); drop table if exists user_friend; create table user_friend stored as orc as select * from hb_user_friend; drop table if exists hb_user_friend; // train create external table hb_train( rowkey string, userid string, eventid string, invited string, timestamp string, interested string ) stored by 'org.apache.hadoop.hive.hbase.HbaseStorageHandler' with serdeproperties ('hbase.columns.mapping'=':key, eu:user,eu:event,eu:invited,eu:timestamp,eu:interested') tblproperties ('hbase.table.name'='events_db:train'); select * from hb_train where userid!='user'; use dwd_events; drop table if exists train; create table train stored as orc as select * from hb_train where userid!='user'; select count(*) from train; select * from train where userid='user'; drop table if exists hb_train; create external table ods_local( local_id int, locale string ) row format delimited fields terminated by "t" location '/events/ods/locale'; select * from ods_local; create table local stored as orc as select * from ods_local; select * from local; drop table if exists ods_local; // time_zone create external table ods_time_zone( time_zone_id int, time_zone string )row format delimited fields terminated by ',' location '/events/ods/timezone'; select * from ods_time_zone; create table time_zone stored as orc as select * from ods_time_zone; drop table if exists ods_time_zone; select * from time_zone; use dws_events; -- user_attend_event_count -- 基本event_attendee 统计 每个user 分别接受到的事件邀请的个数,参加事件的个数,拒绝事件的个数,可能参加事件的个数 -- userid invited_count maybe_attend_count no_attend_count yes_attend_count -- 111 12 0 0 12 -- 222 10 0 10 0 create table user_attend_event_count stored as orc as select attend_user_id as userid, sum(invited) as invited_count, sum(maybe_attend) as maybe_attend_count, sum(no_attend) as no_attend_count, sum(yes_attend) as yes_attend_count from user_event_state group by attend_user_id;
欢迎分享,转载请注明来源:内存溢出
评论列表(0条)