Skip to content

Instantly share code, notes, and snippets.

@alexhanna
Created October 28, 2017 14:30
Show Gist options
  • Save alexhanna/5af42b05188e77ae30b20fc0ebfe8df3 to your computer and use it in GitHub Desktop.
Save alexhanna/5af42b05188e77ae30b20fc0ebfe8df3 to your computer and use it in GitHub Desktop.
Sample Hive example
-- Export a TABLESAMPLE (10 PERCENT) sample of gh_rc rows for year 2013, month 1
-- to the local directory /scratch.1/sample2013_1 as tab-delimited text,
-- overwriting whatever that directory already holds.
-- Runs of spaces, tabs, CRs and LFs in the free-text fields are collapsed to a
-- single space, presumably so they cannot collide with the tab field delimiter
-- or split a record across lines.
-- The user column and struct field are backtick-quoted because USER is a
-- reserved keyword in Hive 1.2.0 and later and does not parse unquoted there.
INSERT OVERWRITE LOCAL DIRECTORY '/scratch.1/sample2013_1'
ROW FORMAT DELIMITED
FIELDS TERMINATED BY "\t"
SELECT
    id_str,
    created_at,
    regexp_replace(text, "[ \t\r\n]+", " "),
    `user`.id_str,
    regexp_replace(`user`.name, "[ \t\r\n]+", " "),
    `user`.screen_name,
    retweeted_status.id_str,
    retweeted_status.created_at,
    regexp_replace(retweeted_status.text, "[ \t\r\n]+", " "),
    retweeted_status.`user`.id_str,
    regexp_replace(retweeted_status.`user`.name, "[ \t\r\n]+", " "),
    retweeted_status.`user`.screen_name
FROM gh_rc TABLESAMPLE (10 PERCENT)
WHERE year = 2013
  AND month = 1;
-- Export a TABLESAMPLE (10 PERCENT) sample of gh_rc rows for year 2013, month 2
-- to the local directory /scratch.1/sample2013_2 as tab-delimited text,
-- overwriting whatever that directory already holds.
-- Runs of spaces, tabs, CRs and LFs in the free-text fields are collapsed to a
-- single space, presumably so they cannot collide with the tab field delimiter
-- or split a record across lines.
-- The user column and struct field are backtick-quoted because USER is a
-- reserved keyword in Hive 1.2.0 and later and does not parse unquoted there.
INSERT OVERWRITE LOCAL DIRECTORY '/scratch.1/sample2013_2'
ROW FORMAT DELIMITED
FIELDS TERMINATED BY "\t"
SELECT
    id_str,
    created_at,
    regexp_replace(text, "[ \t\r\n]+", " "),
    `user`.id_str,
    regexp_replace(`user`.name, "[ \t\r\n]+", " "),
    `user`.screen_name,
    retweeted_status.id_str,
    retweeted_status.created_at,
    regexp_replace(retweeted_status.text, "[ \t\r\n]+", " "),
    retweeted_status.`user`.id_str,
    regexp_replace(retweeted_status.`user`.name, "[ \t\r\n]+", " "),
    retweeted_status.`user`.screen_name
FROM gh_rc TABLESAMPLE (10 PERCENT)
WHERE year = 2013
  AND month = 2;
-- Export a TABLESAMPLE (10 PERCENT) sample of gh_rc rows for year 2013, month 3
-- to the local directory /scratch.1/sample2013_3 as tab-delimited text,
-- overwriting whatever that directory already holds.
-- Runs of spaces, tabs, CRs and LFs in the free-text fields are collapsed to a
-- single space, presumably so they cannot collide with the tab field delimiter
-- or split a record across lines.
-- The user column and struct field are backtick-quoted because USER is a
-- reserved keyword in Hive 1.2.0 and later and does not parse unquoted there.
INSERT OVERWRITE LOCAL DIRECTORY '/scratch.1/sample2013_3'
ROW FORMAT DELIMITED
FIELDS TERMINATED BY "\t"
SELECT
    id_str,
    created_at,
    regexp_replace(text, "[ \t\r\n]+", " "),
    `user`.id_str,
    regexp_replace(`user`.name, "[ \t\r\n]+", " "),
    `user`.screen_name,
    retweeted_status.id_str,
    retweeted_status.created_at,
    regexp_replace(retweeted_status.text, "[ \t\r\n]+", " "),
    retweeted_status.`user`.id_str,
    regexp_replace(retweeted_status.`user`.name, "[ \t\r\n]+", " "),
    retweeted_status.`user`.screen_name
FROM gh_rc TABLESAMPLE (10 PERCENT)
WHERE year = 2013
  AND month = 3;
-- Export a TABLESAMPLE (10 PERCENT) sample of gh_rc rows for year 2013, month 4
-- to the local directory /scratch.1/sample2013_4 as tab-delimited text,
-- overwriting whatever that directory already holds.
-- Runs of spaces, tabs, CRs and LFs in the free-text fields are collapsed to a
-- single space, presumably so they cannot collide with the tab field delimiter
-- or split a record across lines.
-- The user column and struct field are backtick-quoted because USER is a
-- reserved keyword in Hive 1.2.0 and later and does not parse unquoted there.
INSERT OVERWRITE LOCAL DIRECTORY '/scratch.1/sample2013_4'
ROW FORMAT DELIMITED
FIELDS TERMINATED BY "\t"
SELECT
    id_str,
    created_at,
    regexp_replace(text, "[ \t\r\n]+", " "),
    `user`.id_str,
    regexp_replace(`user`.name, "[ \t\r\n]+", " "),
    `user`.screen_name,
    retweeted_status.id_str,
    retweeted_status.created_at,
    regexp_replace(retweeted_status.text, "[ \t\r\n]+", " "),
    retweeted_status.`user`.id_str,
    regexp_replace(retweeted_status.`user`.name, "[ \t\r\n]+", " "),
    retweeted_status.`user`.screen_name
FROM gh_rc TABLESAMPLE (10 PERCENT)
WHERE year = 2013
  AND month = 4;
-- Export a TABLESAMPLE (10 PERCENT) sample of gh_rc rows for year 2013, month 5
-- to the local directory /scratch.1/sample2013_5 as tab-delimited text,
-- overwriting whatever that directory already holds.
-- Runs of spaces, tabs, CRs and LFs in the free-text fields are collapsed to a
-- single space, presumably so they cannot collide with the tab field delimiter
-- or split a record across lines.
-- The user column and struct field are backtick-quoted because USER is a
-- reserved keyword in Hive 1.2.0 and later and does not parse unquoted there.
INSERT OVERWRITE LOCAL DIRECTORY '/scratch.1/sample2013_5'
ROW FORMAT DELIMITED
FIELDS TERMINATED BY "\t"
SELECT
    id_str,
    created_at,
    regexp_replace(text, "[ \t\r\n]+", " "),
    `user`.id_str,
    regexp_replace(`user`.name, "[ \t\r\n]+", " "),
    `user`.screen_name,
    retweeted_status.id_str,
    retweeted_status.created_at,
    regexp_replace(retweeted_status.text, "[ \t\r\n]+", " "),
    retweeted_status.`user`.id_str,
    regexp_replace(retweeted_status.`user`.name, "[ \t\r\n]+", " "),
    retweeted_status.`user`.screen_name
FROM gh_rc TABLESAMPLE (10 PERCENT)
WHERE year = 2013
  AND month = 5;
-- Export a TABLESAMPLE (10 PERCENT) sample of gh_rc rows for year 2013, month 6
-- to the local directory /scratch.1/sample2013_6 as tab-delimited text,
-- overwriting whatever that directory already holds.
-- Runs of spaces, tabs, CRs and LFs in the free-text fields are collapsed to a
-- single space, presumably so they cannot collide with the tab field delimiter
-- or split a record across lines.
-- The user column and struct field are backtick-quoted because USER is a
-- reserved keyword in Hive 1.2.0 and later and does not parse unquoted there.
INSERT OVERWRITE LOCAL DIRECTORY '/scratch.1/sample2013_6'
ROW FORMAT DELIMITED
FIELDS TERMINATED BY "\t"
SELECT
    id_str,
    created_at,
    regexp_replace(text, "[ \t\r\n]+", " "),
    `user`.id_str,
    regexp_replace(`user`.name, "[ \t\r\n]+", " "),
    `user`.screen_name,
    retweeted_status.id_str,
    retweeted_status.created_at,
    regexp_replace(retweeted_status.text, "[ \t\r\n]+", " "),
    retweeted_status.`user`.id_str,
    regexp_replace(retweeted_status.`user`.name, "[ \t\r\n]+", " "),
    retweeted_status.`user`.screen_name
FROM gh_rc TABLESAMPLE (10 PERCENT)
WHERE year = 2013
  AND month = 6;
-- Export a TABLESAMPLE (10 PERCENT) sample of gh_rc rows for year 2013, month 7
-- to the local directory /scratch.1/sample2013_7 as tab-delimited text,
-- overwriting whatever that directory already holds.
-- Runs of spaces, tabs, CRs and LFs in the free-text fields are collapsed to a
-- single space, presumably so they cannot collide with the tab field delimiter
-- or split a record across lines.
-- The user column and struct field are backtick-quoted because USER is a
-- reserved keyword in Hive 1.2.0 and later and does not parse unquoted there.
INSERT OVERWRITE LOCAL DIRECTORY '/scratch.1/sample2013_7'
ROW FORMAT DELIMITED
FIELDS TERMINATED BY "\t"
SELECT
    id_str,
    created_at,
    regexp_replace(text, "[ \t\r\n]+", " "),
    `user`.id_str,
    regexp_replace(`user`.name, "[ \t\r\n]+", " "),
    `user`.screen_name,
    retweeted_status.id_str,
    retweeted_status.created_at,
    regexp_replace(retweeted_status.text, "[ \t\r\n]+", " "),
    retweeted_status.`user`.id_str,
    regexp_replace(retweeted_status.`user`.name, "[ \t\r\n]+", " "),
    retweeted_status.`user`.screen_name
FROM gh_rc TABLESAMPLE (10 PERCENT)
WHERE year = 2013
  AND month = 7;
-- Export a TABLESAMPLE (10 PERCENT) sample of gh_rc2 rows for year 2013, month 8
-- to the local directory /scratch.1/sample2013_8 as tab-delimited text,
-- overwriting whatever that directory already holds.
-- NOTE(review): months 8 to 12 read from gh_rc2 while months 1 to 7 read from
-- gh_rc - presumably the data is split across the two tables, confirm.
-- Runs of spaces, tabs, CRs and LFs in the free-text fields are collapsed to a
-- single space, presumably so they cannot collide with the tab field delimiter
-- or split a record across lines.
-- The user column and struct field are backtick-quoted because USER is a
-- reserved keyword in Hive 1.2.0 and later and does not parse unquoted there.
INSERT OVERWRITE LOCAL DIRECTORY '/scratch.1/sample2013_8'
ROW FORMAT DELIMITED
FIELDS TERMINATED BY "\t"
SELECT
    id_str,
    created_at,
    regexp_replace(text, "[ \t\r\n]+", " "),
    `user`.id_str,
    regexp_replace(`user`.name, "[ \t\r\n]+", " "),
    `user`.screen_name,
    retweeted_status.id_str,
    retweeted_status.created_at,
    regexp_replace(retweeted_status.text, "[ \t\r\n]+", " "),
    retweeted_status.`user`.id_str,
    regexp_replace(retweeted_status.`user`.name, "[ \t\r\n]+", " "),
    retweeted_status.`user`.screen_name
FROM gh_rc2 TABLESAMPLE (10 PERCENT)
WHERE year = 2013
  AND month = 8;
-- Export a TABLESAMPLE (10 PERCENT) sample of gh_rc2 rows for year 2013, month 9
-- to the local directory /scratch.1/sample2013_9 as tab-delimited text,
-- overwriting whatever that directory already holds.
-- Runs of spaces, tabs, CRs and LFs in the free-text fields are collapsed to a
-- single space, presumably so they cannot collide with the tab field delimiter
-- or split a record across lines.
-- The user column and struct field are backtick-quoted because USER is a
-- reserved keyword in Hive 1.2.0 and later and does not parse unquoted there.
INSERT OVERWRITE LOCAL DIRECTORY '/scratch.1/sample2013_9'
ROW FORMAT DELIMITED
FIELDS TERMINATED BY "\t"
SELECT
    id_str,
    created_at,
    regexp_replace(text, "[ \t\r\n]+", " "),
    `user`.id_str,
    regexp_replace(`user`.name, "[ \t\r\n]+", " "),
    `user`.screen_name,
    retweeted_status.id_str,
    retweeted_status.created_at,
    regexp_replace(retweeted_status.text, "[ \t\r\n]+", " "),
    retweeted_status.`user`.id_str,
    regexp_replace(retweeted_status.`user`.name, "[ \t\r\n]+", " "),
    retweeted_status.`user`.screen_name
FROM gh_rc2 TABLESAMPLE (10 PERCENT)
WHERE year = 2013
  AND month = 9;
-- Export a TABLESAMPLE (10 PERCENT) sample of gh_rc2 rows for year 2013, month 10
-- to the local directory /scratch.1/sample2013_10 as tab-delimited text,
-- overwriting whatever that directory already holds.
-- Runs of spaces, tabs, CRs and LFs in the free-text fields are collapsed to a
-- single space, presumably so they cannot collide with the tab field delimiter
-- or split a record across lines.
-- The user column and struct field are backtick-quoted because USER is a
-- reserved keyword in Hive 1.2.0 and later and does not parse unquoted there.
INSERT OVERWRITE LOCAL DIRECTORY '/scratch.1/sample2013_10'
ROW FORMAT DELIMITED
FIELDS TERMINATED BY "\t"
SELECT
    id_str,
    created_at,
    regexp_replace(text, "[ \t\r\n]+", " "),
    `user`.id_str,
    regexp_replace(`user`.name, "[ \t\r\n]+", " "),
    `user`.screen_name,
    retweeted_status.id_str,
    retweeted_status.created_at,
    regexp_replace(retweeted_status.text, "[ \t\r\n]+", " "),
    retweeted_status.`user`.id_str,
    regexp_replace(retweeted_status.`user`.name, "[ \t\r\n]+", " "),
    retweeted_status.`user`.screen_name
FROM gh_rc2 TABLESAMPLE (10 PERCENT)
WHERE year = 2013
  AND month = 10;
-- Export a TABLESAMPLE (10 PERCENT) sample of gh_rc2 rows for year 2013, month 11
-- to the local directory /scratch.1/sample2013_11 as tab-delimited text,
-- overwriting whatever that directory already holds.
-- Runs of spaces, tabs, CRs and LFs in the free-text fields are collapsed to a
-- single space, presumably so they cannot collide with the tab field delimiter
-- or split a record across lines.
-- The user column and struct field are backtick-quoted because USER is a
-- reserved keyword in Hive 1.2.0 and later and does not parse unquoted there.
INSERT OVERWRITE LOCAL DIRECTORY '/scratch.1/sample2013_11'
ROW FORMAT DELIMITED
FIELDS TERMINATED BY "\t"
SELECT
    id_str,
    created_at,
    regexp_replace(text, "[ \t\r\n]+", " "),
    `user`.id_str,
    regexp_replace(`user`.name, "[ \t\r\n]+", " "),
    `user`.screen_name,
    retweeted_status.id_str,
    retweeted_status.created_at,
    regexp_replace(retweeted_status.text, "[ \t\r\n]+", " "),
    retweeted_status.`user`.id_str,
    regexp_replace(retweeted_status.`user`.name, "[ \t\r\n]+", " "),
    retweeted_status.`user`.screen_name
FROM gh_rc2 TABLESAMPLE (10 PERCENT)
WHERE year = 2013
  AND month = 11;
-- Export a TABLESAMPLE (10 PERCENT) sample of gh_rc2 rows for year 2013, month 12
-- to the local directory /scratch.1/sample2013_12 as tab-delimited text,
-- overwriting whatever that directory already holds.
-- Runs of spaces, tabs, CRs and LFs in the free-text fields are collapsed to a
-- single space, presumably so they cannot collide with the tab field delimiter
-- or split a record across lines.
-- The user column and struct field are backtick-quoted because USER is a
-- reserved keyword in Hive 1.2.0 and later and does not parse unquoted there.
INSERT OVERWRITE LOCAL DIRECTORY '/scratch.1/sample2013_12'
ROW FORMAT DELIMITED
FIELDS TERMINATED BY "\t"
SELECT
    id_str,
    created_at,
    regexp_replace(text, "[ \t\r\n]+", " "),
    `user`.id_str,
    regexp_replace(`user`.name, "[ \t\r\n]+", " "),
    `user`.screen_name,
    retweeted_status.id_str,
    retweeted_status.created_at,
    regexp_replace(retweeted_status.text, "[ \t\r\n]+", " "),
    retweeted_status.`user`.id_str,
    regexp_replace(retweeted_status.`user`.name, "[ \t\r\n]+", " "),
    retweeted_status.`user`.screen_name
FROM gh_rc2 TABLESAMPLE (10 PERCENT)
WHERE year = 2013
  AND month = 12;
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment