2022.1.11离线Hive数据仓库学习 dwd

2022.1.11离线Hive数据仓库学习 dwd,第1张

2022.1.11离线Hive数据仓库学习 dwd
离线Hive数据仓库学习
hbase(main):005:0> count 'events_db:train'

flume -> kafka -> hbase -> hive

[root@gree139 hive110]# nohup ./bin/hive --service hiveserver2 &

[root@gree139 hive110]# nohup ./bin/hive --service metastore &

//避免在map阶段聚合

set hive.auto.convert.join=false

确认业务 --> 声明粒度 --> 确认维度 --> 确认事实

ODS	数据准备区	贴源层	//原始数据
DWD	细节数据层	// 脱敏/清洗
DWS	数据服务层	//轻度汇总	聚合
DM	//大宽表,基于某个业务,把不同维度的信息通过粒度聚合起来

DWT	主题层	大宽表
ADS	//数据应用层,精确数据,为不同用户所使用
//创建
create database dwd_events;

use dwd_events;

//  处理 user ,将hbase中的users表映射到hive中,再以orc格式保存到hive表中
create external table hb_users(
    userid string,
    birthyear int,
    gender string,
    locale string,
    joinedAt string,
    location string,
    timezone string
)stored by 'org.apache.hadoop.hive.hbase.HbaseStorageHandler' with serdeproperties
('hbase.columns.mapping'=':key,profile:birthyear,profile:gender,region:locale,registration:joinedAt,region:location,region:timezone')
tblproperties ('hbase.table.name'='events_db:users');

select * from hb_users;

create table users stored as orc as
    select * from hb_users;

select * from users;
drop table if exists hb_users;

// events 将hbase中 events表数据映射到hive表中,并将以orc格式保存
create external  table hb_events(
    eventid string,
    user_id string,
    city string,
    state string,
    zip string,
    country string,
    lat float,
    lng float,
    common_words string,
    start_time string
) stored by 'org.apache.hadoop.hive.hbase.HbaseStorageHandler'
with serdeproperties ('hbase.columns.mapping'=":key,creator:user_id,location:city,location:state,location:zip,location:country,location:lat,location:lng,remark:common_words,schedule:start_time")
tblproperties ('hbase.table.name'='events_db:events');

select * from hb_events;

create table events stored as orc as
    select  * from hb_events;

select * from events;

drop table if exists hb_events;

show tables;

//eventattend
create external table hb_event_attendee(
    row_key string,
    eventid string,
    friendid string,
    state string
) stored by 'org.apache.hadoop.hive.hbase.HbaseStorageHandler'
with serdeproperties ('hbase.columns.mapping'=':key, euat:eventid,euat:friendid,euat:state')
tblproperties ('hbase.table.name'='events_db:event_attendee');

create table event_attendee stored as orc as
    select * from hb_event_attendee;

select * from event_attendee;
drop table if exists hb_event_attendee;

// userfriend
create external table hb_user_friend(
    row_key string,
    userid string,
    friendid string
) stored by 'org.apache.hadoop.hive.hbase.HbaseStorageHandler'
with serdeproperties ('hbase.columns.mapping'=':key, uf:userid,uf:friendid')
tblproperties ('hbase.table.name'='events_db:user_friend');

drop table if exists user_friend;
create table user_friend stored as orc as
    select * from hb_user_friend;

drop table if exists hb_user_friend;

//  train
create external table hb_train(
    rowkey string,
    userid string,
    eventid string,
    invited string,
    timestamp string,
    interested string
) stored by 'org.apache.hadoop.hive.hbase.HbaseStorageHandler'
with serdeproperties ('hbase.columns.mapping'=':key, eu:user,eu:event,eu:invited,eu:timestamp,eu:interested')
tblproperties ('hbase.table.name'='events_db:train');

select * from hb_train where userid!='user';

use dwd_events;

drop table  if exists train;
create table train stored as orc as
    select * from hb_train where userid!='user';

select count(*) from train;

select * from train where userid='user';

drop table if exists hb_train;


create external  table ods_local(
    local_id int,
    locale string
) row format delimited fields terminated by "t"
location '/events/ods/locale';
select * from ods_local;

create table local stored as orc as
    select * from ods_local;

select * from local;
drop table if exists ods_local;

// time_zone
create external table ods_time_zone(
    time_zone_id int,
    time_zone string
)row format delimited fields terminated by ','
location '/events/ods/timezone';

select * from ods_time_zone;
create table time_zone stored as orc as
    select * from ods_time_zone;

drop table if exists ods_time_zone;
select * from time_zone;

use dws_events;
-- user_attend_event_count
-- 基本event_attendee 统计 每个user 分别接受到的事件邀请的个数,参加事件的个数,拒绝事件的个数,可能参加事件的个数
-- userid  invited_count  maybe_attend_count  no_attend_count  yes_attend_count
-- 111       12             0                   0                  12
-- 222       10             0                   10                  0
create table user_attend_event_count stored as orc as
    select attend_user_id as userid,
           sum(invited) as invited_count,
           sum(maybe_attend) as maybe_attend_count,
           sum(no_attend) as no_attend_count,
           sum(yes_attend) as yes_attend_count
           from user_event_state
 group by attend_user_id;








 

欢迎分享,转载请注明来源:内存溢出

原文地址: https://outofmemory.cn/zaji/5704794.html

(0)
打赏 微信扫一扫 微信扫一扫 支付宝扫一扫 支付宝扫一扫
上一篇 2022-12-17
下一篇 2022-12-17

发表评论

登录后才能评论

评论列表(0条)

保存