Last active
August 29, 2017 07:02
-
-
Save alexhanna/7492311 to your computer and use it in GitHub Desktop.
Creating Twitter Hive schema.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
SET hive.exec.compress.output=true; | |
SET mapred.max.split.size=256000000; | |
SET mapred.output.compression.type=BLOCK; | |
SET mapred.output.compression.codec=org.apache.hadoop.io.compress.SnappyCodec; | |
SET hive.exec.dynamic.partition.mode=nonstrict; | |
SET hive.exec.dynamic.partition=true; | |
CREATE EXTERNAL TABLE gh_raw ( | |
id BIGINT, | |
created_at STRING, | |
source STRING, | |
favorited BOOLEAN, | |
retweet_count INT, | |
text STRING, | |
in_reply_to_screen_name STRING, | |
in_reply_to_status_id BIGINT, | |
in_reply_to_user_id BIGINT | |
retweeted_status STRUCT< | |
id BIGINT, | |
created_at STRING, | |
source STRING, | |
favorited BOOLEAN, | |
text:STRING, | |
in_reply_to_screen_name STRING, | |
in_reply_to_status_id BIGINT, | |
in_reply_to_user_id BIGINT, | |
user:STRUCT< | |
screen_name:STRING, | |
name:STRING, | |
friends_count:INT, | |
followers_count:INT, | |
statuses_count:INT, | |
verified:BOOLEAN, | |
utc_offset:INT, | |
time_zone:STRING>>, | |
entities STRUCT< | |
urls:ARRAY<STRUCT<expanded_url:STRING>>, | |
user_mentions:ARRAY<STRUCT<screen_name:STRING,name:STRING>>, | |
hashtags:ARRAY<STRUCT<text:STRING>>>, | |
coordinates STRUCT< | |
coordinates:ARRAY<>, | |
type:STRING | |
>, | |
user STRUCT< | |
screen_name:STRING, | |
name:STRING, | |
friends_count:INT, | |
followers_count:INT, | |
statuses_count:INT, | |
verified:BOOLEAN, | |
utc_offset:INT, | |
time_zone:STRING>, | |
) | |
PARTITIONED BY (year INT, month INT, day INT) | |
ROW FORMAT SERDE 'com.cloudera.hive.serde.JSONSerDe' | |
LOCATION '/user/ahanna/gh_raw'; | |
LOAD DATA INPATH '/user/cloudera/elex2012' INTO TABLE `default.elex` | |
CREATE EXTERNAL TABLE gh ( | |
id BIGINT, | |
created_at STRING, | |
source STRING, | |
favorited BOOLEAN, | |
retweet_count INT, | |
retweeted_status STRUCT< | |
text:STRING, | |
user:STRUCT<screen_name:STRING,name:STRING>>, | |
entities STRUCT< | |
urls:ARRAY<STRUCT<expanded_url:STRING>>, | |
user_mentions:ARRAY<STRUCT<screen_name:STRING,name:STRING>>, | |
hashtags:ARRAY<STRUCT<text:STRING>>>, | |
text STRING, | |
user STRUCT< | |
screen_name:STRING, | |
name:STRING, | |
friends_count:INT, | |
followers_count:INT, | |
statuses_count:INT, | |
verified:BOOLEAN, | |
utc_offset:INT, | |
time_zone:STRING>, | |
in_reply_to_screen_name STRING | |
) | |
PARTITIONED BY (year INT, month INT, day INT) | |
STORED AS rcfile | |
LOCATION '/user/cloudera/gh'; | |
INSERT OVERWRITE TABLE elexrc SELECT * FROM default.elex; |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment