DML语句
load
官网语法 LOAD DATA [LOCAL] INPATH ‘filepath’ [OVERWRITE] INTO TABLE tablename [PARTITION (partcol1=val1, partcol2=val2 …)]
create table dept(
deptno int,
dname string,
location string
) row format delimited fields terminated by '\t';
覆盖 overwrite
LOAD DATA LOCAL INPATH '/home/hadoop/data/dept.txt' OVERWRITE INTO TABLE dept;
追加,local是指在Linux中
LOAD DATA LOCAL INPATH '/home/hadoop/data/dept.txt' INTO TABLE dept;
去掉local,指的是在HDFS中
LOAD DATA INPATH '/hive/dept/dept.txt' OVERWRITE INTO TABLE dept;
发现是把HDFS中的数据移动到了hive中(/user/hive/warehouse/hive.db/table)
[hadoop@hadoop001 conf] hdfs dfs -ls /hive/dept/
Found 1 items
-rw-r--r-- 1 hadoop supergroup 80 2019-03-23 02:34 /hive/dept/dept.txt
[hadoop@hadoop001 conf]
[hadoop@hadoop001 conf] hdfs dfs -ls /hive/dept/
[hadoop@hadoop001 conf]
把表的数据写到Linux文件中,她会自动去创建目录,如果想写到HDFS中,则去掉local,她也会自动去创建目录;会执行MapReduce
INSERT OVERWRITE LOCAL DIRECTORY '/home/hadoop/tmp/d6/emptmp'
row format delimited fields terminated by ','
SELECT empno,ename FROM emp;
在Linux中查看
[hadoop@hadoop001 emptmp] pwd
/home/hadoop/tmp/d6/emptmp
[hadoop@hadoop001 emptmp] cat 000000_0
7369,SMITH
7499,ALLEN
7521,WARD
7566,JONES
7654,MARTIN
7698,BLAKE
7782,CLARK
7788,SCOTT
7839,KING
7844,TURNER
7876,ADAMS
7900,JAMES
7902,FORD
7934,MILLER
8888,HIVE
注意:在hive中不要用INSERT INTO TABLE tablename VALUES 这种方式加载数据,因为会产生很多小文件
在生产中是用Sqoop或者Spark SQL
create table xxx as SELECT empno,ename FROM emp;
还要注意幂等性,用不同的方式处理,但是处理得出的结果要相同。
SQL
hive -e 直接在命令行执行SQL
[hadoop@hadoop001 ~]
[hadoop@hadoop001 ~] hive -e 'use d6_hive;select * from dept'
which: no hbase in (/home/hadoop/app/hive-1.1.0-cdh5.7.0/bin:/home/hadoop/app/hadoop-2.6.0-cdh5.7.0/bin:/home/hadoop/app/hadoop-2.6.0-cdh5.7.0/sbin:/usr/java/jdk1.8.0_144/bin:/bin:/usr/lib64/qt-3.3/bin:/usr/local/bin:/bin:/usr/bin:/usr/local/sbin:/usr/sbin:/sbin)
Logging initialized using configuration in file:/home/hadoop/app/hive-1.1.0-cdh5.7.0/conf/hive-log4j.properties
OK
Time taken: 0.577 seconds
OK
dept.deptno dept.dname dept.location
10 ACCOUNTING NEW YORK
20 RESEARCH DALLAS
30 SALES CHICAGO
40 OPERATIONS BOSTON
Time taken: 0.52 seconds, Fetched: 4 row(s)
hive -f 把SQL写到文件中,生产中常用
[hadoop@hadoop001 ~] hive -f
which: no hbase in (/home/hadoop/app/hive-1.1.0-cdh5.7.0/bin:/home/hadoop/app/hadoop-2.6.0-cdh5.7.0/bin:/home/hadoop/app/hadoop-2.6.0-cdh5.7.0/sbin:/usr/java/jdk1.8.0_144/bin:/bin:/usr/lib64/qt-3.3/bin:/usr/local/bin:/bin:/usr/bin:/usr/local/sbin:/usr/sbin:/sbin)
Missing argument for option: f
usage: hive
-d,--define <key=value> Variable subsitution to apply to hive
commands. e.g. -d A=B or --define A=B
--database <databasename> Specify the database to use
-e <quoted-query-string> SQL from command line
-f <filename> SQL from files
-H,--help Print help information
--hiveconf <property=value> Use value for given property
--hivevar <key=value> Variable subsitution to apply to hive
commands. e.g. --hivevar A=B
-i <filename> Initialization SQL file
-S,--silent Silent mode in interactive shell
-v,--verbose Verbose mode (echo executed SQL to the
console)
[hadoop@hadoop001 ~]
[hadoop@hadoop001 data] cat q.sql
use d6_hive;
select * from emp limit 4;
[hadoop@hadoop001 data] hive -f q.sql
which: no hbase in (/home/hadoop/app/hive-1.1.0-cdh5.7.0/bin:/home/hadoop/app/hadoop-2.6.0-cdh5.7.0/bin:/home/hadoop/app/hadoop-2.6.0-cdh5.7.0/sbin:/usr/java/jdk1.8.0_144/bin:/bin:/usr/lib64/qt-3.3/bin:/usr/local/bin:/bin:/usr/bin:/usr/local/sbin:/usr/sbin:/sbin)
Logging initialized using configuration in file:/home/hadoop/app/hive-1.1.0-cdh5.7.0/conf/hive-log4j.properties
OK
Time taken: 0.605 seconds
OK
emp.empno emp.ename emp.job emp.mgr emp.hiredate emp.sal emp.comm emp.deptno
7369 SMITH CLERK 7902 1980-12-17 800.0 NULL 20
7499 ALLEN SALESMAN 7698 1981-2-20 1600.0 300.0 30
7521 WARD SALESMAN 7698 1981-2-22 1250.0 500.0 30
7566 JONES MANAGER 7839 1981-4-2 2975.0 NULL 20
Time taken: 0.512 seconds, Fetched: 4 row(s)
[hadoop@hadoop001 data]$
where = > >= < <=
limit
between and [] 是左闭合右闭合
(not) in
注意:这个也能查询出来,则表明底层做了一个大小写转换操作,通过查看在MySQL中的元数据信息,得出底层做的是大写转小写的操作。
hive (d6_hive)> select * from emp where Ename in ('SMITH','WARD');
OK
emp.empno emp.ename emp.job emp.mgr emp.hiredate emp.sal emp.comm emp.deptno
7369 SMITH CLERK 7902 1980-12-17 800.0 NULL 20
7521 WARD SALESMAN 7698 1981-2-22 1250.0 500.0 30
Time taken: 0.053 seconds, Fetched: 2 row(s)
聚合函数:max min sum count avg 多进一出
==> 全局
求每个部门的平均工资:
1) 拿到每个部分的信息
2) 在1)的基础之上求平均工资
select deptno,avg(sal) from emp group by deptno;
出现select中的字段要么出现在group by中 要么出现在聚合函数中
select ename,deptno,avg(sal) from emp group by deptno;
NULL 10300.0
10 2916.6666666666665
20 2175.0
30 1566.6666666666667
select deptno,avg(sal) avg_sal from emp group by deptno having avg_sal >=2000;
case when then
select ename, sal,
case
when sal > 1 and sal <=1000 then 'lower'
when sal > 1000 and sal <=2000 then 'middle'
when sal > 2000 and sal <=3000 then 'high'
else 'highest' end
from emp;
函数:
build-in 内置 外置/自定义
查看所有的内置函数
show functions
查看具体内置的函数的用法
power 是求幂运算
hive (d6_hive)> desc function power;
OK
power(x1, x2) - raise x1 to the power of x2
Time taken: 0.013 seconds, Fetched: 1 row(s)
hive (d6_hive)>
查看更加详细的用法
hive (d6_hive)> desc function extended power;
OK
power(x1, x2) - raise x1 to the power of x2
Synonyms: pow
Example:
SELECT power(2, 3) FROM src LIMIT 1;
8
Time taken: 0.019 seconds, Fetched: 5 row(s)
upper/lower:大小写转换
current_date 查看当前时间
hive (d6_hive)> select current_date from dual;
OK
2019-03-23
Time taken: 0.026 seconds, Fetched: 1 row(s)
timestamp 产看当前时间戳
hive (d6_hive)> select current_timestamp from dual;
OK
2019-03-23 12:51:16.351
Time taken: 0.033 seconds, Fetched: 1 row(s)
hive (d6_hive)>
Uninx 时间戳,工作中经常遇到的
hive (d6_hive)> select unix_timestamp() from dual;
OK
1553318732
把Uninx时间戳转换为具体的时间
hive (d6_hive)> select unix_timestamp('2019-03-23 13:25:32') from dual;
OK
1553318732
Time taken: 0.52 seconds, Fetched: 1 row(s)
还可以加上匹配的格式
hive (d6_hive)> select unix_timestamp('20190323132532','yyMMddHHmmss') from dual;
OK
1625327712
Time taken: 0.055 seconds, Fetched: 1 row(s)
hive (d6_hive)>
to_date 把时间转换为年月日格式
hive (d6_hive)> select to_date('2019-03-23 12:02:22') from dual;
OK
2019-03-23
Time taken: 0.053 seconds, Fetched: 1 row(s)
hive (d6_hive)>
date 与 timestamp的区别?date 是只取年月日,timestamp 是取到年月日时分秒
还有年,月,日,时,分,秒;下面是年的截取
hive (d6_hive)> select year('2019-03-23 12:02:22') from dual;
OK
2019
Time taken: 0.058 seconds, Fetched: 1 row(s)
对时间的加减操作
hive (d6_hive)> select date_add('2019-03-23',2) from dual;
OK
2019-03-25
Time taken: 0.461 seconds, Fetched: 1 row(s)
hive (d6_hive)> select date_add('2019-03-23',-2) from dual;
OK
2019-03-21
Time taken: 0.062 seconds, Fetched: 1 row(s)
hive (d6_hive)> select date_sub('2019-03-23',2) from dual;
OK
2019-03-21
Time taken: 0.059 seconds, Fetched: 1 row(s)
hive (d6_hive)> select date_sub('2019-03-23',-2) from dual;
OK
2019-03-25
Time taken: 0.051 seconds, Fetched: 1 row(s)
hive (d6_hive)>
数学函数
round 四舍五入
hive (d6_hive)> select round(3.4) from dual;
OK
3.0
Time taken: 0.084 seconds, Fetched: 1 row(s)
hive (d6_hive)> select round(3.5) from dual;
OK
4.0
Time taken: 0.049 seconds, Fetched: 1 row(s)
hive (d6_hive)>
ceil 向上取整;floor 向下取整
hive (d6_hive)> select ceil(5.4) from dual;
OK
6
Time taken: 0.065 seconds, Fetched: 1 row(s)
hive (d6_hive)> select floor(5.4) from dual;
OK
5
Time taken: 0.053 seconds, Fetched: 1 row(s)
hive (d6_hive)> select floor(5.8) from dual;
OK
5
Time taken: 0.047 seconds, Fetched: 1 row(s)
hive (d6_hive)> select ceil(5.1) from dual;
OK
6
Time taken: 0.043 seconds, Fetched: 1 row(s)
hive (d6_hive)>
字符串的截取:substr substring 可以加上截取字符串的长度
hive (d6_hive)> select substr('abcdefg',2) from dual;
OK
bcdefg
Time taken: 0.044 seconds, Fetched: 1 row(s)
hive (d6_hive)> select substring('abcdefg',2,4) from dual;
OK
bcde
Time taken: 0.041 seconds, Fetched: 1 row(s)
hive (d6_hive)> select substring('abcdefg',2,2) from dual;
OK
bc
Time taken: 0.043 seconds, Fetched: 1 row(s)
hive (d6_hive)> select substring('abcdefg',2) from dual;
OK
bcdefg
Time taken: 0.056 seconds, Fetched: 1 row(s)
hive (d6_hive)> select substring('abcdefg',2,2) from dual;
OK
bc
Time taken: 0.051 seconds, Fetched: 1 row(s)
hive (d6_hive)>
字符串的合并 concat
hive (d6_hive)> select concat('ab','cd','ef') from dual;
OK
abcdef
Time taken: 0.054 seconds, Fetched: 1 row(s)
hive (d6_hive)> select concat('ab','12') from dual;
OK
ab12
Time taken: 0.04 seconds, Fetched: 1 row(s)
hive (d6_hive)>
concat_ws 可以加上指定的字符串
hive (d6_hive)> select concat_ws('**','12','34') from dual;
OK
12**34
Time taken: 0.057 seconds, Fetched: 1 row(s)
hive (d6_hive)>
split 拆分 ;返回的是一个数组
hive (d6_hive)> select split('192.168.100.100:8080',':') from dual;
OK
["192.168.100.100","8080"]
Time taken: 0.067 seconds, Fetched: 1 row(s)
注意:特殊字符需要转义的
hive (d6_hive)> select split('192.168.100.100','.') from dual;
OK
["","","","","","","","","","","","","","","",""]
Time taken: 0.054 seconds, Fetched: 1 row(s)
hive (d6_hive)> select split('192.168.100.100','\.') from dual;
OK
["192","168","100","100"]
Time taken: 0.045 seconds, Fetched: 1 row(s)
hive (d6_hive)>
再次讲解 WordCount
create table hive_wc(sentence string);
load data local inpath '/home/hadoop/data/ruozeinput.txt' into table hive_wc;
单词拆分
hive (d6_hive)> select split(sentence, "\t") from hive_wc;
OK
["hello","world","hello"]
["hello","world","welcome"]
["hello"]
explode 行转列
===> 行转列 列转行
hive (d6_hive)> select explode(split(sentence,"\t")) from hive_wc;
OK
hello
world
hello
hello
world
welcome
hello
统计单词的个数,也可以用MapReduce编程
select word, count(1) as c
from
(select explode(split(sentence, "\t")) as word from hive_wc ) t
group by word
order by c desc;
问题:
1. null 0 “” ‘‘的区别 null 为空值,但是不一定是0;0就是为0;”“为空值;’‘表示有一个空字符串的
2. 求每个部门、工作岗位的最高薪资
select emp.deptno,ename,sal
from emp,
(select deptno,max(sal)maxsal from emp group by deptno) t
where emp.deptno=t.deptno and emp.sal=t.maxsal;
3. desc function extended abs; 中出现的案例是怎么来的呢?
hive (d6_hive)> desc function extended abs;
OK
abs(x) - returns the absolute value of x
Example:
SELECT abs(0) FROM src LIMIT 1;
0
SELECT abs(-5) FROM src LIMIT 1;
5
Time taken: 0.015 seconds, Fetched: 6 row(s)
hive (d6_hive)>
在源码中有说明
4. 求每个月月底
hive (d6_hive)> select last_day('2019-02-12') from dual;
OK
2019-02-28
Time taken: 0.034 seconds, Fetched: 1 row(s)
hive (d6_hive)>