2022.1.14离线Hive数据仓库学习 归一化分析

2022.1.14离线Hive数据仓库学习 归一化分析,第1张

2022.1.14离线Hive数据仓库学习 归一化分析
--  (x - min) / (max - min)  归一化
create temporary macro percentagevalue(value double, min_value double, max_value double)
    case
        when value is null or min_value = max_value then 0.5
        else (value-min_value)/(max_value-min_value)
        end;

desc user_friend_event;

create table train_date
    row format delimited
fields terminated by ","
stored as textfile
as
with pre_tain_data as
         (
             select min(user_age)                      as user_min_age,
                    max(user_age)                      as user_max_age,
                    min(user_member_days)              as user_min_member_days,
                    max(user_member_days)              as user_max_member_days,
                    min(user_friend_count)             as user_min_friend_count,
                    max(user_friend_count)             as user_max_friend_count,
                    min(user_invited_event_count)      as user_min_invited_event_count,
                    max(user_invited_event_count)      as user_max_invited_event_count,
                    min(invite_ahead_days)             as invited_min_ahead_days,
                    max(invite_ahead_days)             as invited_max_ahead_days,
                    min(event_city_level)              as event_min_city_level,
                    max(event_city_level)              as event_max_city_level,
                    min(event_country_level)           as event_min_country_level,
                    max(event_country_level)           as event_max_country_level,
                    min(event_lat)                     as event_min_lat,
                    max(event_lat)                     as event_max_lat,
                    min(event_lng)                     as event_min_lng,
                    max(event_lng)                     as event_max_lng,
                    min(event_start_month)             as event_min_start_month,
                    max(event_start_month)             as event_max_start_month,
                    min(event_start_dayofweek)         as event_min_start_dayofweek,
                    max(event_start_dayofweek)         as event_max_start_dayofweek,
                    min(event_start_hour)              as event_min_start_hour,
                    max(event_start_hour)              as event_max_start_hour,
                    min(start_ahead_days)              as start_min_ahead_days,
                    max(start_ahead_days)              as start_max_ahead_days,
                    min(invited_friends_count)         as invited_min_friends_count,
                    max(invited_friends_count)         as invited_max_friends_count,
                    min(maybe_friends_count)           as maybe_min_friends_count,
                    max(maybe_friends_count)           as maybe_max_friends_count,
                    min(yes_friends_count)             as yes_min_friends_count,
                    max(yes_friends_count)             as yes_max_friends_count,
                    min(no_friends_count)              as no_min_friends_count,
                    max(no_friends_count)              as no_max_friends_count,
                    min(user_had_event_count)          as user_min_had_event_count,
                    max(user_had_event_count)          as user_max_had_event_count,
                    min(user_maybe_attend_event_count) as user_min_maybe_attend_event_count,
                    max(user_maybe_attend_event_count) as user_max_maybe_attend_event_count,
                    min(user_no_attend_event_count)    as user_min_no_attend_event_count,
                    max(user_no_attend_event_count)    as user_max_no_attend_event_count,
                    min(user_yes_attend_event_count)   as user_min_yes_attend_event_count,
                    max(user_yes_attend_event_count)   as user_max_yes_attend_event_count,
                    min(user_locale)                   as user_min_locale,
                    max(user_locale)                   as user_max_locale,
                    min(user_time_zone)                as user_min_time_zone,
                    max(user_time_zone)                as user_max_time_zone,
                    min(event_invited_user_count)      as event_min_invited_user_count,
                    max(event_invited_user_count)      as event_max_invited_user_count,
                    min(event_yes_user_count)          as event_min_yes_user_count,
                    max(event_yes_user_count)          as event_max_yes_user_count,
                    min(event_no_user_count)           as event_min_no_user_count,
                    max(event_no_user_count)          as event_max_no_user_count,
                    min(event_maybe_user_count)        as event_min_maybe_user_count,
                    max(event_maybe_user_count)          as event_max_maybe_user_count
             from user_friend_event
         )
select
    t.userid,
    t.eventid,
    t.user_interested,
    t.user_invited,
    percentagevalue(t.event_city_level, s.event_min_city_level, s.event_max_city_level) as event_city_level,
    percentagevalue(t.event_country_level, s.event_min_country_level, s.event_max_country_level) as event_country_level,
    percentagevalue(t.event_lat, s.event_min_lat, s.event_max_lat) as event_lat,
    percentagevalue(t.event_lng, s.event_min_lng, s.event_max_lng) as event_lng,
    percentagevalue(t.invite_ahead_days, s.invited_min_ahead_days, s.invited_max_ahead_days) as invite_ahead_days,
    percentagevalue(t.event_start_month, s.event_max_start_month, s.event_min_start_month) as event_start_month,
    percentagevalue(t.event_start_dayofweek, s.event_max_start_dayofweek, s.event_min_start_dayofweek) as event_start_dayofweek,
    percentagevalue(t.event_start_hour, s.event_max_start_hour, s.event_min_start_hour) as event_start_hour,
    percentagevalue(t.start_ahead_days, s.start_min_ahead_days, s.start_max_ahead_days) as start_ahead_days,
    percentagevalue(t.user_friend_count, s.user_min_friend_count, s.user_max_friend_count) as user_friend_count,
    t.event_creator_is_friend,
    percentagevalue(t.invited_friends_count, s.invited_min_friends_count, s.invited_max_friends_count) as invited_friends_count,
    percentagevalue(t.maybe_friends_count, s.maybe_min_friends_count, s.maybe_max_friends_count) as maybe_friends_count,
    percentagevalue(t.yes_friends_count, s.yes_min_friends_count, s.yes_max_friends_count) as yes_friends_count,
    percentagevalue(t.no_friends_count, s.no_min_friends_count, s.no_max_friends_count) as no_friends_count,
    t.invited_friends_percentage,
    t.yes_friends_percentage,
    t.no_friends_percentage,
    t.maybe_friends_percentage,
    percentagevalue(t.user_age, s.user_min_age, s.user_max_age) as user_age,
    t.user_gender,
    percentagevalue(t.user_member_days, s.user_min_member_days, s.user_max_member_days) as user_member_days,
    t.location_similar,
    percentagevalue(t.user_had_event_count, s.user_min_had_event_count, s.user_max_had_event_count) as user_had_event_count,
    percentagevalue(t.user_invited_event_count, s.user_min_invited_event_count, s.user_max_invited_event_count) as user_invited_event_count,
    percentagevalue(t.user_maybe_attend_event_count, s.user_min_maybe_attend_event_count, s.user_max_maybe_attend_event_count) as user_maybe_attend_event_count,
    percentagevalue(t.user_no_attend_event_count, s.user_min_no_attend_event_count, s.user_max_no_attend_event_count) as user_no_attend_event_count,
    percentagevalue(t.user_yes_attend_event_count, s.user_min_yes_attend_event_count, s.user_max_yes_attend_event_count) as user_yes_attend_event_count,
    percentagevalue(t.user_locale, s.user_min_locale, s.user_max_locale) as user_locale,
    percentagevalue(t.user_time_zone, s.user_min_time_zone, s.user_max_time_zone) as user_time_zone,
    percentagevalue(t.event_invited_user_count, s.event_min_invited_user_count, s.event_max_invited_user_count) as event_invited_user_count,
    percentagevalue(t.event_yes_user_count, s.event_min_yes_user_count, s.event_max_yes_user_count) as event_yes_user_count,
    percentagevalue(t.event_no_user_count, s.event_min_no_user_count, s.event_max_no_user_count) as event_no_user_count,
    percentagevalue(t.event_maybe_user_count, s.event_min_maybe_user_count, s.event_max_maybe_user_count) as event_maybe_user_count
from dm_attendee.user_friend_event t cross join pre_tain_data s;


select count(*) from train_date;
select * from user_friend_event where user_locale is null ;

欢迎分享,转载请注明来源:内存溢出

原文地址: http://outofmemory.cn/zaji/5704835.html

(0)
打赏 微信扫一扫 微信扫一扫 支付宝扫一扫 支付宝扫一扫
上一篇 2022-12-17
下一篇 2022-12-17

发表评论

登录后才能评论

评论列表(0条)

保存