1. text (\t 으로 구분, /data/text 폴더에 저장)
-- TEXTFILE table: tab-delimited rows stored under /data/text.
-- ymd = date string, tag = label, cnt = count (per the column names; no
-- further semantics are documented in the source notes).
CREATE TABLE IF NOT EXISTS tb_text (
    ymd STRING,
    tag STRING,
    cnt INT
)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY '\t'
STORED AS TEXTFILE
LOCATION '/data/text'
;
테이블을 생성한 뒤 hive에서 아래와 같이 실행하면 insert할 때 압축된다.
-- Enable compression of query output written by INSERT.
SET hive.exec.compress.output=true;
-- Run exactly ONE of the two codec lines below: each SET replaces the
-- previous value, so executing both leaves Snappy in effect.
SET mapred.output.compression.codec=org.apache.hadoop.io.compress.GzipCodec; -- gzip
SET mapred.output.compression.codec=org.apache.hadoop.io.compress.SnappyCodec; -- snappy
2. parquet
-- Parquet table.
-- NOTE(review): the original reused the name tb_text and LOCATION
-- '/data/text' from section 1 (copy-paste), which collides with the
-- TEXTFILE table and would mix files of both formats in one directory;
-- renamed to tb_parquet with its own storage path.
-- The parquet.hive.* SerDe/InputFormat/OutputFormat classes are the
-- legacy pre-Hive-0.13 bindings; on Hive 0.13+ the same table can be
-- declared with just STORED AS PARQUET.
CREATE TABLE IF NOT EXISTS tb_parquet (
    ymd STRING,
    tag STRING,
    cnt INT
)
ROW FORMAT SERDE 'parquet.hive.serde.ParquetHiveSerDe'
STORED AS
    INPUTFORMAT 'parquet.hive.DeprecatedParquetInputFormat'
    OUTPUTFORMAT 'parquet.hive.DeprecatedParquetOutputFormat'
LOCATION '/data/parquet'
;
압축은 hive 인터프리터에서 셋 중 하나를 실행하면 설정된다.
-- Parquet compression codec: run exactly ONE of these. Each SET replaces
-- the previous value, so running all three leaves SNAPPY in effect.
set parquet.compression=UNCOMPRESSED;
set parquet.compression=GZIP;
set parquet.compression=SNAPPY;
3. rcfile
-- RCFile table.
-- NOTE(review): renamed from the duplicated tb_text / '/data/text'
-- (copy-paste from section 1) so each storage format has its own table
-- name and directory and the CREATE statements don't collide.
CREATE TABLE IF NOT EXISTS tb_rcfile (
    ymd STRING,
    tag STRING,
    cnt INT
)
STORED AS RCFILE
LOCATION '/data/rcfile'
;
압축은 이렇게
-- Enable compression of query output written by INSERT.
SET hive.exec.compress.output=true;
-- Run exactly ONE of the two codec lines below: the second SET overrides
-- the first, so executing both leaves Snappy in effect.
SET mapred.output.compression.codec=org.apache.hadoop.io.compress.GzipCodec; -- gzip
SET mapred.output.compression.codec=org.apache.hadoop.io.compress.SnappyCodec; -- snappy
4. sequence
-- SequenceFile table.
-- NOTE(review): renamed from the duplicated tb_text / '/data/text'
-- (copy-paste from section 1) so each storage format has its own table
-- name and directory and the CREATE statements don't collide.
CREATE TABLE IF NOT EXISTS tb_sequence (
    ymd STRING,
    tag STRING,
    cnt INT
)
STORED AS SEQUENCEFILE
LOCATION '/data/sequence'
;
압축은 이렇게
-- Enable compression of query output written by INSERT.
SET hive.exec.compress.output=true;
-- Run exactly ONE of the two codec lines below: the second SET overrides
-- the first, so executing both leaves Snappy in effect.
SET mapred.output.compression.codec=org.apache.hadoop.io.compress.GzipCodec; -- gzip
SET mapred.output.compression.codec=org.apache.hadoop.io.compress.SnappyCodec; -- snappy
5. orc
-- ORC table, declared uncompressed via "orc.compress"="NONE"
-- (swap in ZLIB or SNAPPY as shown below to enable compression).
-- NOTE(review): renamed from the duplicated tb_text / '/data/text'
-- (copy-paste from section 1) so each storage format has its own table
-- name and directory and the CREATE statements don't collide.
CREATE TABLE IF NOT EXISTS tb_orc (
    ymd STRING,
    tag STRING,
    cnt INT
)
STORED AS ORC
LOCATION '/data/orc'
TBLPROPERTIES ("orc.compress"="NONE")
;
압축은 위의 tblproperties 값을 아래 값 중 하나로 대체하면 된다.
tblproperties ("orc.compress"="ZLIB")
tblproperties ("orc.compress"="SNAPPY")