Created
May 15, 2012 10:53
-
-
Save jorisbontje/2700805 to your computer and use it in GitHub Desktop.
Twitter sentiment analysis using Apache Hive
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
-- Start from a clean slate so the script can be re-run.
-- Tables are dropped in reverse order of creation.
drop table if exists top5_positive_hashtags_per_day;
drop table if exists count_positive_hashtags_per_day;
drop table if exists positive_hashtags_per_day;
drop table if exists tweets;
drop table if exists raw_tweets;
-- Staging table: each row holds one unparsed JSON document in a single
-- string column.
create table raw_tweets (
    json string
);

-- Load the sample file from the local file system of the client.
load data local inpath 'sample.json'
into table raw_tweets;
-- Extract the fields used downstream from the raw JSON:
--   text        the $.text field
--   ts_created  the $.created_at field parsed into a unix timestamp, which is
--               NULL when the value does not match the pattern
create table tweets as
select
    get_json_object(json, "$.text") as text,
    unix_timestamp(
        get_json_object(json, "$.created_at"),
        "EEE MMM d HH:mm:ss Z yyyy"
    ) as ts_created
from raw_tweets;
-- One row per (day, hashtag) occurrence, taken only from tweets whose text
-- contains a positive smiley: a colon or semicolon, an optional dash and a
-- closing parenthesis.
--
-- The text is split on any run of whitespace (split takes a regular
-- expression) so that hashtags adjacent to tabs or line breaks are found as
-- well, not only those delimited by single spaces.
--
-- NOTE(review): the semicolon inside the smiley pattern is backslash-escaped,
-- presumably so the Hive CLI does not treat it as a statement terminator.
-- Confirm before simplifying the pattern.
create table positive_hashtags_per_day as
select
    from_unixtime(ts_created, 'yyyy-MM-dd') as dt,
    lower(hashtag) as hashtag
from tweets
lateral view explode(split(text, '\\s+')) words as hashtag
where ts_created is not null
  and hashtag rlike "^#[a-zA-Z0-9]+$"
  and text rlike "^.*[\;:]-?\\).*$";
-- Number of occurrences of each hashtag on each day.
create table count_positive_hashtags_per_day as
select
    dt,
    hashtag,
    count(*) as cnt
from positive_hashtags_per_day
group by
    dt,
    hashtag;
-- Make the reducer script available to the streaming job.
add file topN.py;

-- Keep the five most frequent hashtags of each day.
-- The inner query sends all rows of a day to the same reducer
-- (distribute by dt) and orders them by descending count within it
-- (sort by dt, cnt desc). topN.py relies on that ordering: it emits the
-- first N rows it reads for each key and drops the rest.
create table top5_positive_hashtags_per_day as
reduce
    dt,
    hashtag,
    cnt
using 'topN.py 5' as dt, hashtag, cnt
from (
    select
        dt,
        hashtag,
        cnt
    from count_positive_hashtags_per_day
    distribute by dt
    sort by dt, cnt desc
) sorted_counts;
-- Print the final result.
select
    dt,
    hashtag,
    cnt
from top5_positive_hashtags_per_day;
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python
"""Streaming reducer that keeps only the first N rows of each key.

Usage: topN.py N

Reads tab-separated rows from standard input. The key of a row is everything
before the first tab. Rows are expected to arrive grouped by key and already
sorted in the desired order within each key. For every run of rows that share
a key, the first N rows are written to standard output unchanged and the
remaining ones are dropped.
"""
import sys


def top_n(lines, max_n):
    """Yield at most ``max_n`` rows for every run of consecutive equal keys.

    lines  -- iterable of text lines, each optionally newline-terminated
    max_n  -- maximum number of rows to emit per key

    Yields each kept row without its line terminator. Blank lines are
    skipped. A row without a tab is treated as a key with no value.
    """
    last_key = None
    emitted = 0
    for line in lines:
        # Strip only the line terminator so that whitespace which is part of
        # the first or last field is preserved.
        row = line.rstrip("\r\n")
        if not row:
            continue
        key = row.partition("\t")[0]
        if key != last_key:
            # A new key starts: reset the per-key counter.
            last_key = key
            emitted = 0
        if emitted < max_n:
            emitted += 1
            yield row


def main(argv):
    """Command-line entry point. Returns the process exit status."""
    if len(argv) < 2:
        sys.stderr.write("usage: %s N\n" % argv[0])
        return 2
    max_n = int(argv[1])
    # sys.stdout.write behaves the same on Python 2 and Python 3.
    for row in top_n(sys.stdin, max_n):
        sys.stdout.write(row + "\n")
    return 0


if __name__ == "__main__":
    sys.exit(main(sys.argv))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment