Hive基础知识_随笔

Hive基础知识 1.数仓概念的理解

数据仓库，英文名叫data warehouse. 主要使用来存储历史数据的。
数仓一般按主题划分，整合了不通数据源的所有数据，存储的一般是历史数据，基本不会有什么变化，数据量会随着时间增量变化。

与传统业务数据库不同，传统数据库主要是面向事务，数仓主要是面向分析的，也就是主要有利于数据分析，就是可取的方向。

数据仓库的分层大致为数据源，ODS(细节数据), DW(数仓) , DA(数据应用)。

ETL将数据源数据抽取，转换，加载到数据仓库，用于数据分析。

2. Hive的理解

Hive是构建在hadoop之上的数据仓库框架。
Hive的核心是通过将HFDS上文件，映射成一张表，并提供SQL查询的功能。
Hive的本质是将SQL转换为MapReduce任务，在hadoop上执行。
可以简单理解为Hive是MapReduce的一个客户端。

2.1 Hive架构

Hive架构主要包括

Hive客户端
比如Hive Cli, Beeline，java, python客户端
metastore
主要用于存储元数据，一般存储在mysql
driver
1. SQL Parser
2. Physical Plan
3. Query Optimizer
4. Execution
存储和计算
1. 主要存储在HDFS
2. 使用MapReduce计算, 也可以使用spark和tez

2.2 Hive的安装

推荐使用远程模式启动。本教程使用2.1.0

metastore服务
hiveserver2
mysql
(略)

启动方式1 hive-cli：

hive   # 进入hive
hive -e "show databases;" # 执行sql 
hive -f ./hive.sql # 执行sql文件

启动方式2 beeline：

beeline

2.3 数据库 *** 作

create database if not exists test01;
desc database test01;
drop database test01;
drop database test01 cascade;
-------------------------------------------
use myhive;

-- 创建外部表-teacher表
create external table teacher (tid string,tname string)
row format delimited fields terminated by 't';

-- 创建外部表-student表
create external table student
(
    sid    string,
    sname  string,
    sbirth string,
    ssex   string
)
    row format delimited fields terminated by 't';

-- 从本地文件系统向表中加载数据，就是将本地的文件复制一份到HDFS的表目录下

-- 向student表加载数据,就是将本地的文件复制一份到HDFS的表目录下
load data local inpath '/export/data/hivedata/student.txt' into table student;
-- 向student表加载数据,并覆盖原来数据
load data local inpath '/export/data/hivedata/student.txt' overwrite into table student;
-- 查询student表
select * from student;

-- 删除表
drop table student;


-- 从HDFS向teacher表加载数据 ,本质上做的是剪切，将原来的文件移到到表目录文件
 -- 首先需要将teacher.txt上传到HDFS(/hivedatas)
load data inpath '/hivedatas/teacher.txt' into table  teacher;

select  * from teacher;

drop table teacher;


-- 加载weblog数据
drop table if exists ods_weblog_origin;
create table weblog_origin(
    valid string,
    remote_addr string,
    remote_user string,
    time_local string,
    request string,
    status string,
    body_bytes_sent string,
    http_referer string,
    http_user_agent string
)
row format delimited fields terminated by 't';


load data local inpath '/export/data/hivedata/part-r-00000' into table  weblog_origin;

select * from weblog_origin;

-- 演示三张表共享一份数据
drop table if exists weblog_origin1;
create external table weblog_origin1
(
    valid           string,
    remote_addr     string,
    remote_user     string,
    time_local      string,
    request         string,
    status          string,
    body_bytes_sent string,
    http_referer    string,
    http_user_agent string
)
row format delimited fields terminated by 't'
location '/hivedatas/weblog_origin';


drop table if exists weblog_origin2;
create external table weblog_origin2
(
    valid           string,
    remote_addr     string,
    remote_user     string,
    time_local      string,
    request         string,
    status          string,
    body_bytes_sent string,
    http_referer    string,
    http_user_agent string
)
    row format delimited fields terminated by 't'
    location '/hivedatas/weblog_origin';


drop table if exists weblog_origin3;
create external table weblog_origin3
(
    valid           string,
    remote_addr     string,
    remote_user     string,
    time_local      string,
    request         string,
    status          string,
    body_bytes_sent string,
    http_referer    string,
    http_user_agent string
)
    row format delimited fields terminated by 't'
    location '/hivedatas/weblog_origin';


select * from weblog_origin1;
select * from weblog_origin2;
select * from weblog_origin3;


drop table weblog_origin1; -- 外部表删除，只删除元数据
drop table weblog_origin2;
drop table weblog_origin3;

select count(*) from weblog_origin1; -- 11893   23786



-- -- -- -- -- -- -- -- -- -- -- -- -- 复杂类型-- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- --

-- 1、Array类型
create external table hive_array
(
    name           string,
    work_locations array
)
row format delimited fields terminated by 't'
collection items terminated by ',';

load data local inpath '/export/data/hivedata/array.txt' into table hive_array;

select * from hive_array;

-- 查询
-- 查询所有数据
select * from hive_array;
-- 查询work_locations数组中第一个元素
select name, work_locations[0] location from hive_array;
-- 查询location数组中元素的个数
select name, size(work_locations) location_size from hive_array;
-- 查询location数组中包含tianjin的信息
select * from hive_array where array_contains(work_locations,'tianjin');

-- 2、Map类型

create table hive_map
(
    id      int,
    name    string,
    members map,
    age     int
)
row format delimited
fields terminated by ','
collection items terminated by '#'
map keys terminated by ':';

load data local inpath '/export/data/hivedata/map.txt' into table hive_map;

select * from hive_map;

select * from hive_map;
-- 根据键找对应的值
select id, name, members['father'] father, members['mother'] mother, age from hive_map;

-- 获取所有的键
select id, name, map_keys(members) as relation from hive_map;
-- 获取所有的值
select id, name, map_values(members) as relation from hive_map;
-- 获取键值对个数
select id,name,size(members) num from hive_map;
-- 获取有指定key的数据
select * from hive_map where array_contains(map_keys(members), 'brother');

-- 查找包含brother这个键的数据，并获取brother键对应的值
select id, name, members['brother'] brother
from hive_map
where array_contains(map_keys(members), 'brother');

-- 3、Struct类型

create table hive_struct
(
    ip   string,
    info struct
)
row format delimited
fields terminated by '#'
collection items terminated by ':';

load data local inpath '/export/data/hivedata/struct.txt' into table hive_struct;

select * from hive_struct;
--根据struct来获取指定的成员的值
select ip, info.name from hive_struct;
select ip, info.name ,info.age from hive_struct;


-- #################分区表#############
-- 分区就是分文件夹
-- month表示分区字段，以后文件夹的名字就是： month=xxx
create table score
(
    sid    string,
    cid    string,
    sscore int
) partitioned by (month string) row format delimited fields terminated by 't';

-- 分区表在加载数据时，必须制定你要将这些数据放在哪个文件夹下
load data local inpath '/export/data/hivedata/score.txt' overwrite into table score partition(month='202101');
load data local inpath '/export/data/hivedata/score.txt' overwrite into table score partition(month='202102');


select * from score;

select * from score where month = '202102';

desc score;


-- 创建多级分区表
create table score2
(
    sid    string,
    cid    string,
    sscore int
) partitioned by (year string,month string,day string) row format delimited fields terminated by 't';

load data local inpath '/export/data/hivedata/score.txt'
     into table score2 partition(year='2021',month='01',day='01');

load data local inpath '/export/data/hivedata/score.txt'
    into table score2 partition(year='2021',month='01',day='02');


load data local inpath '/export/data/hivedata/score.txt'
    into table score2 partition(year='2021',month='02',day='01');

load data local inpath '/export/data/hivedata/score.txt'
    into table score2 partition(year='2022',month='01',day='01');

select * from score2;

desc score2;

select * from score2 where year = '2021' and month = '01' and day='01';



-- 查看表的所有分区
show  partitions  score;

show  partitions  score2;


-- 添加分区
alter table score add partition(month='202008');
-- 同时添加多个分区
alter table score add partition(month='202009') partition(month = '202010');

-- 删除分区
alter table score drop partition(month = '202010');

insert into  table score partition(month = '202011') values(1,1,1);

欢迎分享，转载请注明来源：内存溢出

原文地址: https://outofmemory.cn/zaji/5136087.html

Hive基础知识

发表评论

评论列表（0条）