Skip to content

Instantly share code, notes, and snippets.

@alexhanna
Last active August 29, 2017 07:02
Show Gist options
  • Save alexhanna/7492311 to your computer and use it in GitHub Desktop.
Save alexhanna/7492311 to your computer and use it in GitHub Desktop.
Creating Twitter Hive schema.
SET hive.exec.compress.output=true;
SET mapred.max.split.size=256000000;
SET mapred.output.compression.type=BLOCK;
SET mapred.output.compression.codec=org.apache.hadoop.io.compress.SnappyCodec;
SET hive.exec.dynamic.partition.mode=nonstrict;
SET hive.exec.dynamic.partition=true;
CREATE EXTERNAL TABLE gh_raw (
id BIGINT,
created_at STRING,
source STRING,
favorited BOOLEAN,
retweet_count INT,
text STRING,
in_reply_to_screen_name STRING,
in_reply_to_status_id BIGINT,
in_reply_to_user_id BIGINT
retweeted_status STRUCT<
id BIGINT,
created_at STRING,
source STRING,
favorited BOOLEAN,
text:STRING,
in_reply_to_screen_name STRING,
in_reply_to_status_id BIGINT,
in_reply_to_user_id BIGINT,
user:STRUCT<
screen_name:STRING,
name:STRING,
friends_count:INT,
followers_count:INT,
statuses_count:INT,
verified:BOOLEAN,
utc_offset:INT,
time_zone:STRING>>,
entities STRUCT<
urls:ARRAY<STRUCT<expanded_url:STRING>>,
user_mentions:ARRAY<STRUCT<screen_name:STRING,name:STRING>>,
hashtags:ARRAY<STRUCT<text:STRING>>>,
coordinates STRUCT<
coordinates:ARRAY<>,
type:STRING
>,
user STRUCT<
screen_name:STRING,
name:STRING,
friends_count:INT,
followers_count:INT,
statuses_count:INT,
verified:BOOLEAN,
utc_offset:INT,
time_zone:STRING>,
)
PARTITIONED BY (year INT, month INT, day INT)
ROW FORMAT SERDE 'com.cloudera.hive.serde.JSONSerDe'
LOCATION '/user/ahanna/gh_raw';
LOAD DATA INPATH '/user/cloudera/elex2012' INTO TABLE `default.elex`
CREATE EXTERNAL TABLE gh (
id BIGINT,
created_at STRING,
source STRING,
favorited BOOLEAN,
retweet_count INT,
retweeted_status STRUCT<
text:STRING,
user:STRUCT<screen_name:STRING,name:STRING>>,
entities STRUCT<
urls:ARRAY<STRUCT<expanded_url:STRING>>,
user_mentions:ARRAY<STRUCT<screen_name:STRING,name:STRING>>,
hashtags:ARRAY<STRUCT<text:STRING>>>,
text STRING,
user STRUCT<
screen_name:STRING,
name:STRING,
friends_count:INT,
followers_count:INT,
statuses_count:INT,
verified:BOOLEAN,
utc_offset:INT,
time_zone:STRING>,
in_reply_to_screen_name STRING
)
PARTITIONED BY (year INT, month INT, day INT)
STORED AS rcfile
LOCATION '/user/cloudera/gh';
INSERT OVERWRITE TABLE elexrc SELECT * FROM default.elex;
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment