Created
May 20, 2013 19:46
-
-
Save bmentges/5614916 to your computer and use it in GitHub Desktop.
My word frequency implementation for Data Science on coursera.org
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# coding: utf-8 | |
import types | |
import sys | |
import json | |
import re | |
from collections import defaultdict | |
# Punctuation characters removed from tweet text before it is split into
# words. The apostrophe, '@' and backtick are not in the set, so they are
# left in place by the substitution.
_STRIPPED_PUNCTUATION = '!"#$%&()*+,-./:;<=>?[\\]^_{|}~'
regex = re.compile('[%s]' % re.escape(_STRIPPED_PUNCTUATION))
def smart_str(s, encoding='utf-8', strings_only=False, errors='strict'): | |
if strings_only and isinstance(s, (types.NoneType, int)): | |
return s | |
elif not isinstance(s, basestring): | |
try: | |
return str(s) | |
except UnicodeEncodeError: | |
if isinstance(s, Exception): | |
return ' '.join([smart_str(arg, encoding, strings_only, | |
errors) for arg in s]) | |
return unicode(s).encode(encoding, errors) | |
elif isinstance(s, unicode): | |
return s.encode(encoding, errors) | |
elif s and encoding != 'utf-8': | |
return s.decode('utf-8', errors).encode(encoding, errors) | |
else: | |
return s | |
class Tweet(object):
    """One entry of a tweet stream, decoded from its JSON text."""

    def __init__(self, raw_tweet):
        """Decode ``raw_tweet`` and keep the result in ``self.raw_tweet``.

        Args:
            raw_tweet: Text of a single JSON document (one line of the
                tweet file).

        Raises:
            ValueError: If ``raw_tweet`` is non-blank but not valid JSON.
        """
        # Blank or whitespace-only lines are not valid JSON; treat them as an
        # empty payload so one stray newline does not abort a whole run.
        if hasattr(raw_tweet, 'strip') and not raw_tweet.strip():
            self.raw_tweet = {}
        else:
            self.raw_tweet = json.loads(raw_tweet)

    def is_tt(self):
        """Return True when the payload is a JSON object with a "text" key."""
        # The dict check keeps non-object payloads (numbers, lists, ...) from
        # raising or matching by accident in the membership test.
        return isinstance(self.raw_tweet, dict) and "text" in self.raw_tweet

    def get_tweet_ws(self):
        """Return the tweet's words, or an empty list if it has no text.

        Punctuation matched by the module-level ``regex`` is removed before
        the text is split on whitespace.
        """
        if not self.is_tt():
            return []
        text = self._rp(smart_str(self.raw_tweet['text']))
        # split() without arguments splits on any whitespace run (newlines
        # included) and never yields empty strings, so no extra cleanup of
        # the result is needed.
        return text.split()

    def _rp(self, s):
        """Return ``s`` with every match of the module-level ``regex`` removed."""
        return regex.sub('', s)
class FrequencyEngine(object):
    """Compute the relative frequency of every word in a stream of tweets."""

    def __init__(self, tweet_file):
        """Store the tweet source and start with empty counts.

        Args:
            tweet_file: Iterable yielding one JSON tweet per item, e.g. an
                open file iterated line by line.
        """
        self.tweet_file = tweet_file
        # Raw occurrence counts. compute_frequency() leaves these as counts
        # and returns the frequencies in a separate mapping.
        self.word_frequency = {"total_words": 0, "words": defaultdict(int)}

    def reduce_words(self, accumulated, word):
        """Count one occurrence of ``word`` in ``accumulated`` and return it."""
        accumulated["total_words"] += 1
        accumulated["words"][word] += 1
        return accumulated

    def reduce_tweets(self, accumulated, tweet):
        """Count every word of the JSON ``tweet`` in ``accumulated``.

        Returns:
            The accumulator after all words of the tweet were added.
        """
        for word in Tweet(tweet).get_tweet_ws():
            accumulated = self.reduce_words(accumulated, word)
        return accumulated

    def reduce_frequency(self, accumulated, word):
        """Replace the count of ``word`` in ``accumulated`` by its frequency.

        The frequency is the stored value divided by
        ``accumulated["total_words"]``. The mapping is modified in place and
        returned.
        """
        total = float(accumulated["total_words"])
        encountered = float(accumulated["words"][word])
        accumulated["words"][word] = encountered / total
        return accumulated

    def compute_frequency(self):
        """Consume the tweet source and return the word frequencies.

        Returns:
            A dict with ``"total_words"`` (number of words counted) and
            ``"words"`` (mapping of word to its share of all words).
        """
        counts = self.word_frequency
        for tweet in self.tweet_file:
            counts = self.reduce_tweets(counts, tweet)

        # Convert a copy: dividing the stored counts in place would make a
        # second call divide already-computed frequencies by the total again.
        frequency = {
            "total_words": counts["total_words"],
            "words": defaultdict(int, counts["words"]),
        }
        for word in list(frequency["words"]):
            frequency = self.reduce_frequency(frequency, word)
        return frequency
def main():
    """Print the frequency of every word in the tweet file named by argv[1].

    One line is written per distinct word, in the form
    ``word<TAB>frequency`` with the frequency rounded to three decimals.
    """
    # The context manager closes the file even if a tweet fails to parse.
    with open(sys.argv[1]) as tweet_file:
        frequencies = FrequencyEngine(tweet_file).compute_frequency()["words"]

    for word, frequency in frequencies.items():
        # A single parenthesised argument prints identically with the
        # Python 2 print statement and the Python 3 print function.
        print("%s\t%.3f" % (word, frequency))


if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment