Skip to content

Instantly share code, notes, and snippets.

@jorisbontje
Created May 15, 2012 10:53
Show Gist options
  • Save jorisbontje/2700805 to your computer and use it in GitHub Desktop.
Save jorisbontje/2700805 to your computer and use it in GitHub Desktop.
Twitter sentiment analysis using Apache Hive
-- Reset every table this script builds so that it can be re-run from scratch.
-- Derived tables are dropped first and the raw staging table last.
drop table if exists top5_positive_hashtags_per_day;
drop table if exists count_positive_hashtags_per_day;
drop table if exists positive_hashtags_per_day;
drop table if exists tweets;
drop table if exists raw_tweets;
-- Staging table with a single string column that receives the loaded data.
create table raw_tweets (
    json string
);

-- Load the local file sample.json into the staging table.
load data local inpath 'sample.json'
into table raw_tweets;
-- Pull the tweet text and its creation time out of the raw JSON.
-- ts_created is the created_at field parsed with the given date pattern.
create table tweets as
select
    get_json_object(json, "$.text") as text,
    unix_timestamp(
        get_json_object(json, "$.created_at"),
        "EEE MMM d HH:mm:ss Z yyyy"
    ) as ts_created
from raw_tweets;
-- One row per hashtag occurrence in a tweet that contains a smiley such as :) or :-)
-- together with the calendar day (yyyy-MM-dd) on which the tweet was created.
--
-- The tweet text is split on any run of whitespace, so hashtags that sit next to
-- a tab or a line break are isolated as well, not only those delimited by spaces.
-- Only tokens made up of a hash sign followed by letters and digits are kept,
-- and they are lower-cased so that counting is case-insensitive.
--
-- rlike already succeeds when any substring matches, so the smiley pattern is
-- left unanchored. Wrapping it in leading and trailing dot-star would stop it
-- from matching tweets that span several lines, because dot does not match
-- line terminators.
--
-- NOTE(review): the backslash in front of the semicolon inside the smiley
-- pattern looks like an escape for the Hive CLI statement splitter. It is kept
-- exactly as it was. Confirm against the Hive version in use.
--
-- Tweets whose ts_created is null are skipped.
create table positive_hashtags_per_day as
select
    from_unixtime(ts_created, 'yyyy-MM-dd') as dt,
    lower(hashtag) as hashtag
from tweets
lateral view explode(split(text, "\\s+")) b as hashtag
where ts_created is not null
    and hashtag rlike "^#[a-zA-Z0-9]+$"
    and text rlike "[\;:]-?\\)";
-- Number of occurrences of each positive hashtag on each day.
create table count_positive_hashtags_per_day as
select
    dt,
    hashtag,
    count(*) as cnt
from positive_hashtags_per_day
group by
    dt,
    hashtag;
-- Register the reducer script so that the reduce step below can invoke it.
add file topN.py;

-- Keep the first 5 rows that the reducer script sees for each day.
-- The inner query sends all rows of one day to the same reducer and orders
-- them by descending count, so those first 5 rows are the most frequent hashtags.
create table top5_positive_hashtags_per_day as
reduce dt, hashtag, cnt
    using 'topN.py 5'
    as dt, hashtag, cnt
from (
    select
        dt,
        hashtag,
        cnt
    from count_positive_hashtags_per_day
    distribute by dt
    sort by dt, cnt desc
) cnts;
-- Display the final result.
select
    dt,
    hashtag,
    cnt
from top5_positive_hashtags_per_day;
#!/usr/bin/env python
"""Reducer that returns the top N results per keyword.

Usage: topN.py N

Reads tab-separated records from standard input. The first field of each
record is the key and the remainder of the line is the value. For every run
of consecutive records sharing the same key, only the first N records are
written to standard output; the rest are dropped. The input is expected to
arrive already grouped by key and ordered within each key.

The script runs unchanged on Python 2 and Python 3.
"""
import sys


def top_n(lines, max_n):
    """Yield at most ``max_n`` records for each run of equal keys.

    lines -- iterable of text lines of the form ``key<TAB>value``; a trailing
             line terminator is allowed.
    max_n -- maximum number of records to emit per run of equal keys.

    Yields the kept records as ``key<TAB>value`` strings without a line
    terminator. Blank lines are skipped. A line without a tab is treated as a
    key with an empty value.
    """
    last_key = None
    emitted = 0
    for line in lines:
        # Strip only the line terminator: leading/trailing whitespace inside
        # the fields is data and must survive.
        record = line.rstrip("\r\n")
        if not record:
            continue
        key, _, value = record.partition("\t")
        if key != last_key:
            # A new run of keys starts: reset the per-key counter.
            last_key = key
            emitted = 0
        if emitted < max_n:
            emitted += 1
            yield "%s\t%s" % (key, value)


def main(argv):
    """Command-line entry point; returns the process exit status."""
    if len(argv) != 2:
        sys.stderr.write("usage: %s N\n" % argv[0])
        return 2
    max_n = int(argv[1])
    for record in top_n(sys.stdin, max_n):
        sys.stdout.write(record + "\n")
    return 0


if __name__ == "__main__":
    sys.exit(main(sys.argv))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment