Hive内部表和外部表

Posted by JustDoDT on March 25, 2018

创建数据库

CREATE DATABASE hive;
desc database

在MySQL中查看元数据信息

mysql> select * from dbs\G;
********* 1. row *********
          DB_ID: 1
           DESC: Default Hive database
DB_LOCATION_URI: hdfs://localhost:9000/user/hive/warehouse
           NAME: default
     OWNER_NAME: public
     OWNER_TYPE: ROLE
********* 2. row *********
          DB_ID: 6
           DESC: NULL
DB_LOCATION_URI: hdfs://localhost:9000/user/hive/warehouse/hive.db
           NAME: hive
     OWNER_NAME: hadoop
     OWNER_TYPE: USER
********* 3. row *********
          DB_ID: 7
           DESC: NULL
DB_LOCATION_URI: hdfs://localhost:9000/d6_hive/test
           NAME: hive2
     OWNER_NAME: hadoop
     OWNER_TYPE: USER
********* 4. row *********
          DB_ID: 11
           DESC: NULL
DB_LOCATION_URI: hdfs://localhost:9000/user/hive/warehouse/testdb.db
           NAME: testdb
     OWNER_NAME: hadoop
     OWNER_TYPE: USER
********* 5. row *********
          DB_ID: 12
           DESC: NULL
DB_LOCATION_URI: hdfs://localhost:9000/user/hive/warehouse/d6_hive.db
           NAME: d6_hive
     OWNER_NAME: hadoop
     OWNER_TYPE: USER
5 rows in set (0.00 sec)

ERROR: 
No query specified

mysql> 

修改hive参数的几种形式

1) set hive.metastore.warehouse.dir;
	set key 取值
	set key=value   设置值
	局部  单session
2)hive-site.xml
	全局

set xxxx=yyyy
....	
set xxxx=zzzz

hive.db  
	create database hive;  hive是db的名称.db

CREATE DATABASE hive2
location '/d6_hive/directory';

row format/ file format
	两大分隔符:行与行 字段与字段之间的分隔符
		列分隔符:\001
	file格式:行式  列式

数值类型: int bigint float double DECIMAL	
字符串:string

true/false   0/1

创建表

create table emp(
empno int,
ename string,
job string,
mgr int,
hiredate string,
sal double,
comm double,
deptno int
) ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t';

加载数据

LOAD DATA [LOCAL] INPATH 'filepath' [OVERWRITE] INTO TABLE tablename ;
LOCAL: 从本地(linux)加载数据	
	
LOAD DATA LOCAL INPATH '/home/hadoop/data/emp.txt' OVERWRITE INTO TABLE emp ;

拷贝表结构

CREATE TABLE emp2
LIKE emp;

拷贝表结构和数据

create table emp3 as select * from emp;

查看表结构

官网1 官网2

desc formatted emp3;  #推荐使用
desc extended emp3;
desc emp3;

修改表名

ALTER TABLE emp3 RENAME TO new_emp3;

官网

删除表

官网

drop table new_emp3;      

内部表/外部表

官网

官网对比

官网

内部表 Table Type: MANAGED_TABLE

hive default: MANAGED_TABLE hive默认的是内部表 EXTERNAL

创建内部表

create table managed_emp(
empno int,
ename string,
job string,
mgr int,
hiredate string,
sal double,
comm double,
deptno int
) ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t';

在mysql中

mysql> select * from tbls\G;
********* 1. row *********
            TBL_ID: 1
       CREATE_TIME: 1552944715
             DB_ID: 1
  LAST_ACCESS_TIME: 0
             OWNER: hadoop
         RETENTION: 0
             SD_ID: 1
          TBL_NAME: d6_wc
          TBL_TYPE: MANAGED_TABLE
VIEW_EXPANDED_TEXT: NULL
VIEW_ORIGINAL_TEXT: NULL
********* 2. row *********
            TBL_ID: 6
       CREATE_TIME: 1552961328
             DB_ID: 12
  LAST_ACCESS_TIME: 0
             OWNER: hadoop
         RETENTION: 0
             SD_ID: 6
          TBL_NAME: emp
          TBL_TYPE: MANAGED_TABLE
VIEW_EXPANDED_TEXT: NULL
VIEW_ORIGINAL_TEXT: NULL
********* 3. row *********
            TBL_ID: 11
       CREATE_TIME: 1553271846
             DB_ID: 12
  LAST_ACCESS_TIME: 0
             OWNER: hadoop
         RETENTION: 0
             SD_ID: 11
          TBL_NAME: emp2
          TBL_TYPE: MANAGED_TABLE
VIEW_EXPANDED_TEXT: NULL
VIEW_ORIGINAL_TEXT: NULL
********* 4. row *********
            TBL_ID: 14
       CREATE_TIME: 1553274632
             DB_ID: 12
  LAST_ACCESS_TIME: 0
             OWNER: hadoop
         RETENTION: 0
             SD_ID: 14
          TBL_NAME: managed_emp
          TBL_TYPE: MANAGED_TABLE
VIEW_EXPANDED_TEXT: NULL
VIEW_ORIGINAL_TEXT: NULL
4 rows in set (0.00 sec)

ERROR: 
No query specified

mysql> 

删除内部表:数据+元数据 删除 hive (d6_hive)> drop table managed_emp;

在MySQL中

mysql> select * from tbls\G;
********* 1. row *********
            TBL_ID: 1
       CREATE_TIME: 1552944715
             DB_ID: 1
  LAST_ACCESS_TIME: 0
             OWNER: hadoop
         RETENTION: 0
             SD_ID: 1
          TBL_NAME: d6_wc
          TBL_TYPE: MANAGED_TABLE
VIEW_EXPANDED_TEXT: NULL
VIEW_ORIGINAL_TEXT: NULL
********* 2. row *********
            TBL_ID: 6
       CREATE_TIME: 1552961328
             DB_ID: 12
  LAST_ACCESS_TIME: 0
             OWNER: hadoop
         RETENTION: 0
             SD_ID: 6
          TBL_NAME: emp
          TBL_TYPE: MANAGED_TABLE
VIEW_EXPANDED_TEXT: NULL
VIEW_ORIGINAL_TEXT: NULL
********* 3. row *********
            TBL_ID: 11
       CREATE_TIME: 1553271846
             DB_ID: 12
  LAST_ACCESS_TIME: 0
             OWNER: hadoop
         RETENTION: 0
             SD_ID: 11
          TBL_NAME: emp2
          TBL_TYPE: MANAGED_TABLE
VIEW_EXPANDED_TEXT: NULL
VIEW_ORIGINAL_TEXT: NULL
3 rows in set (0.00 sec)

创建外部表

官网

create EXTERNAL table emp_external(
empno int,
ename string,
job string,
mgr int,
hiredate string,
sal double,
comm double,
deptno int
) ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
location '/d6_hive/external';	

删除外部表:数据不删,元数据删

hive (d6_hive)> drop table emp_external;

在MySQL中发现元数据已经删除了

在hive-site.xml文件中添加如下

<!--在hive的cli打印当前库-->
<property>
   <name>hive.cli.print.current.db</name>
   <value>true</value>
</property>

<!--在hive的cli打印当前表的字段名-->
<property>
   <name>hive.cli.print.header</name>
   <value>true</value>
</property>

在hive的官网中的配置文件中有详细说明

官网

内部表和外部的区别以及使用场景

未被external修饰的是内部表(managed table),被external 修饰的外部表(external table);

区别:

内部表数据由Hive自身管理,外部表数据由HDFS管理;

内部表数据存储的位置是hive.metastore.warehouse.dir(默认:/usr/hive/warehouse),外部表数据的存储位置由自己制定。

删除内部表会直接删除元数据(metadata)及存储数据;删除外部表仅仅会删除元数据,HDFS上的文件并不会被删除;

对内部表的修改会将修改直接同步给元数据,而对外部表的表结构和分区进行修改,则需要修复(MSCK REPAIR TABLE table_name;)

使用场景:其他部门共有的,可以保证数据安全