import operator
import re
import glob
import datetime

import nltk
from dateutil import parser
# Words to skip when counting: prepositions, articles, auxiliaries, question
# words, plus Twitter/scraper chrome ("retweets", "permalink", timestamps
# such as "21h21") seen in the scraped pages.
often_words = [
    'aboard', 'about', 'above', 'across', 'after', 'against', 'along', 'amid',
    'among', 'anti', 'around', 'as', 'at', 'before', 'behind', 'below',
    'beneath', 'beside', 'besides', 'between', 'beyond', 'but', 'by',
    'concerning', 'considering', 'despite', 'down', 'during', 'except',
    'excepting', 'excluding', 'following', 'for', 'from', 'in', 'inside',
    'into', 'like', 'minus', 'near', 'of', 'off', 'on', 'onto', 'opposite',
    'outside', 'over', 'past', 'per', 'the', 'a', 'plus', 'regarding',
    'round', 'save', 'since', 'than', 'through', 'to', 'toward', 'towards',
    'under', 'underneath', 'unlike', 'until', 'up', 'upon', 'versus', 'via',
    'with', 'within', 'without', 'account', 'embedded', 'permalink', 'dec',
    'jan', 'retweets', 'image', 'hours', '2015', 'reply', 'you', 'is', 'are',
    'am', 'was', 'were', 'will', 'do', 'does', 'did', 'have', 'had', 'has',
    'can', 'could', 'should', 'shall', 'may', 'might', 'would', 'likes',
    'retweet', 'more', '\xe2\x80\xa6', 'and', 'ago', 'what', 'when', 'why',
    'which', 'who', 'how', 'whose', 'whom', 'it', 'all', 'your', '21h21',
    '22h22', 'verified', 'new', 'be', '-', 'that', 'this', '&', 'out', 'not',
    'we', 'so', 'no', 'its', '\xe6\x9d\xb1\xe6\x96\xb9\xe7\xa5\x9e\xe8\xb5\xb7',
    '...', 'retweeted', '|', 'says', 'rt', 'lead', 'an', '',
    'httpwwwbbccouknewsuk', 'if', 'year', 'get', 'day', 'times', 'summary',
    'our', 'ho', 'i', 'added', 'now', 'york', 'been', 'gov', 'just', 'years',
    'green', 'great', 'or', 'daily', 'make', 'giving', 'time', 'view', 'my',
    'some', 'need', 'where', 'they', 'watch', 'use', 'high', 'help',
    'police', 'seconds', 'their', 'business']
# Brown-corpus frequency list used as a second stop-word filter.
fdist = nltk.FreqDist(nltk.corpus.brown.words())
# most_common() returns (word, count) pairs; keep just the lower-cased words
# so the membership test in twitter_wordcount below actually works.
mostcommon = [w.lower() for w, _ in fdist.most_common(150)]
def daily_count_words(words):
    # Sum per-day occurrence counts over several words. Relies on the
    # module-level prsd_dated_tweets built in __main__ below.
    w_counter = []
    for word in words:
        w_counter.append(count_word_in_tweets(word, prsd_dated_tweets))
    # Merge the per-word {date: count} dicts by summing counts per date.
    return reduce(lambda x, y: dict((k, v + y[k]) for k, v in x.iteritems()),
                  w_counter)
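# Illustrative call (values hypothetical): per-day totals for a set of
# related words, keyed by the parsed tweet dates.
#
#   daily_count_words(['flood', 'rain'])
#   # -> {datetime(2015, 12, 12, 0, 0): 7, datetime(2016, 1, 5, 0, 0): 3}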
def combine_df(data):
    # Pivot {word: {date: count}} into {date: {word: count}}, one inner
    # dict per date. Assumes every word shares the same set of dates.
    new = {}
    a = data.keys()
    for k in data[a[0]]:
        new[k] = {}
    for w in data:
        for k in data[w]:
            new[k][w] = data[w][k]
    return new
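# Example of the pivot (d1, d2 stand for datetime keys):
#
#   combine_df({'flood': {d1: 2, d2: 0}, 'rain': {d1: 1, d2: 4}})
#   # -> {d1: {'flood': 2, 'rain': 1}, d2: {'flood': 0, 'rain': 4}}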
def parse_date(tweet):
    # Absolute dates from last year, e.g. "12 Dec 2015".
    last_year_date = re.findall(r'(\d+\s\D{3}\s2015)', tweet)
    if last_year_date:
        return parser.parse(last_year_date[0])
    # Day-of-month dates from this January, e.g. "Jan 5".
    this_january_date = re.findall(r'\sJan\s(\d+)', tweet)
    # TODO: extend this January-only search to cover the whole current
    # year, as the last-year branch above already does.
    if this_january_date:
        return parser.parse("%s Jan 2016" % this_january_date[0])
    # Relative timestamps such as "3 hours" mean the tweet is from today.
    if re.findall(r'(\d+)\shour', tweet):
        return datetime.datetime.now()
    return None
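# Illustrative calls, one per branch above:
#
#   parse_date('BBC News 12 Dec 2015 ...')  # -> datetime(2015, 12, 12, 0, 0)
#   parse_date('BBC News Jan 5 ...')        # -> datetime(2016, 1, 5, 0, 0)
#   parse_date('posted 3 hours ago')        # -> current datetime
#   parse_date('no date at all')            # -> None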
def parse_tweets(tweets):
    # {id: {'date': datetime or None, 'text': raw tweet text}}
    prsd_tweets = {}
    for i, tweet in enumerate(tweets):
        prsd_tweets[i] = {'date': parse_date(tweet), 'text': tweet}
    return prsd_tweets
def parse_tweets_by_date(tweets):
    # {datetime: [tweet, ...]}; tweets with no parseable date group under None.
    prsd_dated_tweets = {}
    for tweet in tweets:
        date = parse_date(tweet)  # parse once, not three times per tweet
        prsd_dated_tweets.setdefault(date, []).append(tweet)
    return prsd_dated_tweets
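# Example shape of the result:
#
#   parse_tweets_by_date(tweets)
#   # -> {datetime(2016, 1, 5, 0, 0): ['first tweet ...', 'second tweet ...'],
#   #     None: ['tweet with no recognisable date ...']}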
def count_word_in_str(word, string):
    # Case-insensitive substring count; re.escape guards against regex
    # metacharacters in the search word.
    return len(re.findall(re.escape(word), string.lower()))
def count_word_in_tweets(word, tweets):
    counter = {}
    for k in tweets:
        counter[k] = count_word_in_str(word, ' '.join(tweets[k]))
    counter.pop(None, None)  # drop tweets whose date could not be parsed
    return counter
def count_word_in_tweets_for_df(word):
    # Build a column-oriented dict ({'Date': [...], word: [...]}) from the
    # module-level prsd_dated_tweets, ready to become a DataFrame.
    counter = {'Date': [], word: []}
    for k in prsd_dated_tweets:
        counter['Date'].append(k)
        messages = ' '.join(prsd_dated_tweets[k])
        counter[word].append(count_word_in_str(word, messages))
    return counter
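# The dict above is shaped for pandas; a sketch, assuming pandas is
# installed (it is not imported anywhere in this gist):
#
#   import pandas as pd
#   df = pd.DataFrame(count_word_in_tweets_for_df('flood')).set_index('Date')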
def find_files(folder):
    filelist = []
    for files in glob.glob(folder + "/*.txt"):
        filelist.append(files)
        print files
    return filelist
def read_files(filelist):
    text = ''
    for fileitem in filelist:
        with open(fileitem, 'r') as f:
            text += f.read()
    return text
def split_text_to_tweets(text):
    # The scraped pages appear to end each tweet with a like counter such
    # as "3 like", so that pattern serves as the tweet delimiter.
    return re.split(r'\d\slike', text)
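# Example: two tweets separated by their like counters:
#
#   split_text_to_tweets('first tweet 3 like second tweet 0 like')
#   # -> ['first tweet ', ' second tweet ', '']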
def twitter_wordcount(text, quantity):
    # Count words, skipping the stop words above, pure digits, and the
    # 150 most common Brown-corpus words; return the top `quantity`.
    regex = re.compile('[^a-zA-Z]')
    wordcount = {}
    for word in text.split():
        word = regex.sub('', word.lower())
        if word not in often_words and not word.isdigit() and word not in mostcommon:
            wordcount[word] = wordcount.get(word, 0) + 1
    return sorted(wordcount.iteritems(), key=operator.itemgetter(1), reverse=True)[:quantity]
if __name__ == '__main__':
    print 'Found files:'
    text = read_files(find_files('data'))
    tweets = split_text_to_tweets(text)
    print 'Total amount of tweets: %s' % len(tweets)
    print 'Fifty most common words in tweets:'
    print twitter_wordcount(text, 50)
    prsd_tweets = parse_tweets(tweets)
    prsd_dated_tweets = parse_tweets_by_date(tweets)
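    # A possible follow-up (not in the original gist): once prsd_dated_tweets
    # exists, the helpers above can chart mentions per day, e.g. with pandas:
    #
    #   import pandas as pd
    #   counts = daily_count_words(['flood', 'storm'])  # {date: total}
    #   df = pd.DataFrame(sorted(counts.items()), columns=['Date', 'mentions'])
    #   print df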