Hive 笔记 | 不可思议

# Hive 笔记

# Hive 工作原理

Hive 工作原理

# 特点

普通表：表结构元数据及数据在 HDFS 中
外部表：表结构元数据在 HDFS 中，实际数据在表结构 LOCATION 参数指定的文件路径中

# Hive SQL

# 数据类型

列类型
- 整型
  - TINYINT, 后缀 Y
  - SMALLINT, 后缀 S
  - INT
  - BIGINT, 后缀 L
- 字符串
  - VARCHAR, 长度 1~65535
  - CHAR, 长度 255
- 时间
  - 支持 UNIX 时间戳（可选纳秒的精度）
  - 支持 java.sql.Timestamp 格式: YYYY-MM-DD HH:MM:SS.ffffffffff 和 YYYY-MM-DD HH:MM:ss.ffffffffff
- 日期: 如 YYYY-MM-DD
- 小数: DECIMAL(precision, scale), 例 decimal(10, 0)
- 联合类型: 是异类的数据类型的集合
  - 语法: UNIONTYPE<int, double, array<string>, struct<a: int, b: string>>
  - 示例:{0:1}, {1:2.0}, {2:["three", "four"]}, {3:{"a": 5, "b": "five"}}
文字
- 浮点类型
- 十进制类型
Null 值: 缺少值通过特殊值 Null 表示
复杂类型
- 数组: Array<dataType>
- 映射: Map<primitiveType, dataType>
- 结构体: Struct<colName, dataType [COMMENT colComment], ...>

# 内置运算符

关系运算符
- =, !=
- <, >
- <=, >=
- IS NULL, IS NOT NULL
- LIKE, RLIKE
- REGEXP（等同于 RLIKE）
算术运算符
- +, -, *, /, %
- &, |, ^, ~
逻辑运算符
- AND, &&
- OR, ||
- NOT, !
复杂运算符
- A[n] 访问数组 A 第 n 个元素（n>=0)
- M[key] 访问映射中关键字 key 的值
- S.x 访问结构 S 的字段 s 值

# 内置函数

round, floor, ceil, rand
concat, substr
upper, ucase, lower, lcase
trim, ltrim, rtrime
regexp_replace
size
cast
from_unixtime, to_date, year, month, day
get_json_object
聚合函数: count, sum, avg, min, max

# DML

# 操作数据库

-- 创建数据库语法: CREATE DATABASE|SCHEMA [IF NOT EXISTS] <databaseName>
-- 示例:
create database [if not exists] userdb;
-- 或
create schema userdb;

-- 查看数据库
show databases;

-- 删了数据库语法: DROP DATABASE StatementDROP (DATABASE|SCHEMA) [IF EXISTS] databaseName [RESTRICT|CASCADE];
-- 示例:
drop database if exists userdb;
drop database if exists userdb cascade;  -- CASCADE 意味着需求删除所有相应表才能删库
drop schema userdb;

# 操作表

-- 创建表语法:
create [TEMPORARY] [EXTERNAL] table [IF NOT EXISTS] [dbName.] tableName
[(colName datatType [COMMENT colComment], ...)]
[COMMENT tableComment]
[ROW FORMAT rowFormat]
[STORED AS fileFormat]

-- 示例:
create table if not exists employee
(
  eid int,
  name String,
  salary String,
  desigination String
)
comment 'Employee details'
row format delimited fields terminated by '\t' lines terminated by '\n'
stored as textfile;

-- 修改表语法:
alert table tableName RENAME TO newTableName;
alert table tableName ADD COLUMNS (colSpec[, colSpec ...]);
alert table tableName DROP [COLUMN] columnName;
alert table tableName CHANGE columnName newColumnName newType;
alert table tableName REPLACE COLUMNS (colSpec[, colSpec ...]);


-- 示例:
alert table employee rename to emp;  -- 修改表名

alert table employee change name ename String;  -- 更改字段
alert table employee change salary salary Double;

alert table employee add columns(
  dept String COMMENT 'Department name')
);  -- 添加字段

alert table employee replace columns(
  eid int empid int,
  ename string name string
);  -- 替换列???

-- 删除表语法: DROP TABLE [IF EXISTS] tableName;

-- 示例:
drop table if exists employee;

-- 查看表
show tables;

# 加载数据

load data [LOCAL] inpath 'filePath' [OVERWRITE] into table tableName
[PARTITION (partCol1=val1, partCol2=val2, ...)]

-- 示例:
load data local inpath '/home/user/sample.txt' overwrite into table employee;

# 分区操作

-- 添加分区
alert table tableName
ADD [if not exists] PARTITION
partitionSpec [LOCATION 'location1']
partitionSpec [LOCATION 'location2'] ...;

partitionSpec:
(pColumn = pColValue, pColumn = pColValue, ...)
-- 示例
alert table employee
ADD PARTITION
(year='2013') location '/2012/part2012';


-- 重命名分区
alert table tableName
PARTITION partitionSpec RENAME TO PARTITION partitionSpec;
-- 示例
alert table employee PARTITION (year='1203') RENAME TO PARTITION (Yoj='1203');


-- 删除分区
alert table tableName
DROP [IF EXISTS] PARTITION partitionSpec, PARTITION partitionSpec, ...;
-- 示例
alert table employee drop [if exists] partition (year='1203');

# 视图操作

-- 创建视图语法:
create VIEW [if not exists] viewName [(columnName [COMMENT columnComment], ...)]
[COMMENT tableComment]
AS select ...

-- 示例:
create view emp_30000 AS
select * from employee
where salary > 30000;

-- 删除视图语法:
drop VIEW viewName;

-- 示例:
drop view emp_30000;

# 索引操作

-- 创建索引语法:
create index indexName
on table baseTableName (colName, ...)
as 'index.handler.class.name'
[with deferred rebuild]
[idxproperties (propertyName=propertyValue, ...)]
[in table indexTableName]
[partitioned by (colName, ...)]
[
  [row format ...] stored as ...
  | stored by ...
]
[location hdfsPath]
[tblproperties (...)]

-- 示例:
create index index_salary on table employee(salary)
as 'org.apache.hadoop.hive.ql.index.compact.CompactIndexHandler';

-- 删除索引语法:
drop index indexName on tableName;

-- 示例:
drop index indexSalary on employee;

# DQL

# select

select [ALL | DISTINCT] selectExpr, selectExpr, ...
from tableReference
[WHERE whereCondition]
[GROUP BY colList]
[HAVING havingCondition]
[CLUSTER BY colList | [DISTRIBUTE BY colList] [SORT BY colList]]
[ORDER BY colList]
[LIMIT number];

-- 示例:
select * from employee where salary > 30000;
select Id, Name, Dept from employee order by dept;
select Dept, count(*) from employee group by dept;

# join

语法

tableReference JOIN tableFactor [joinCondition]
| tableReference {LEFT|RIGHT|FULL} [OUTER] JOIN tableReference
joinCondition
| tableReference LEFT SEMI JOIN tableReference joinCondition
| tableReference CROSS JOIN tableReference [joinCondition]

示例

select c.Id, c.Name, c.Age, o.Amount
from customers c JOIN orders o
on (o.customer_id = c.Id);


select c.Id, c.Name, c.Age, o.Amount
from customers c LEFT OUTER JOIN orders o
on (o.customer_id = c.Id);

select c.Id, c.Name, c.Age, o.Amount
from customers c RIGHT OUTER JOIN orders o
on (o.customer_id = c.Id);

-- full outer join 连接表包含两个表的所有记录（缺少匹配的一侧为 NULL）
select c.Id, c.Name, c.Age, o.Amount
from customers c FULL OUTER JOIN orders o
on (o.customer_id = c.Id);

# 参考

hive(数据仓库工具) https://baike.baidu.com/item/hive/67986?fr=aladdin
Hive 教程 https://www.yiibai.com/hive

# 记录

2019/12/28 16:23 周六创建

ODPS