import operator
import nltk
import re
from dateutil import parser
from datetime import date, datetime
import glob
# Words to ignore in the frequency count: prepositions, auxiliaries, question
# words, and noise tokens from the scraped Twitter pages. The two byte strings
# are UTF-8 for an ellipsis and a CJK token that appears in the scrape.
often_words = ['aboard', 'about', 'above', 'across', 'after', 'against', 'along', 'amid',
               'among', 'anti', 'around', 'as', 'at', 'before', 'behind', 'below', 'beneath',
               'beside', 'besides', 'between', 'beyond', 'but', 'by', 'concerning',
               'considering', 'despite', 'down', 'during', 'except', 'excepting', 'excluding',
               'following', 'for', 'from', 'in', 'inside', 'into', 'like', 'minus', 'near',
               'of', 'off', 'on', 'onto', 'opposite', 'outside', 'over', 'past', 'per', 'the',
               'a', 'plus', 'regarding', 'round', 'save', 'since', 'than', 'through', 'to',
               'toward', 'towards', 'under', 'underneath', 'unlike', 'until', 'up', 'upon',
               'versus', 'via', 'with', 'within', 'without', 'account', 'embedded',
               'permalink', 'dec', 'jan', 'retweets', 'image', 'hours', '2015', 'reply',
               'you', 'is', 'are', 'am', 'was', 'were', 'will', 'do', 'does', 'did', 'have',
               'had', 'has', 'can', 'could', 'should', 'shall', 'may', 'might', 'would',
               'likes', 'retweet', 'more', '\xe2\x80\xa6', 'and', 'ago', 'what', 'when',
               'why', 'which', 'who', 'how', 'whose', 'whom', 'it', 'all', 'your', '21h21',
               '22h22', 'verified', 'new', 'be', '-', 'that', 'this', '&', 'out', 'not',
               'we', 'so', 'no', 'its', '\xe6\x9d\xb1\xe6\x96\xb9\xe7\xa5\x9e\xe8\xb5\xb7',
               '...', 'retweeted', '|', 'says', 'rt', 'lead', 'an', '',
               'httpwwwbbccouknewsuk', 'if', 'year', 'get', 'day', 'times', 'summary', 'our',
               'ho', 'i', 'added', 'now', 'york', 'been', 'gov', 'just', 'years', 'green',
               'great', 'or', 'daily', 'make', 'giving', 'time', 'view', 'my', 'some',
               'need', 'where', 'they', 'watch', 'use', 'high', 'help', 'police', 'seconds',
               'their', 'business']
# Baseline of very frequent English words from the Brown corpus. Keep only the
# word strings: most_common() returns (word, count) tuples, and the membership
# test in twitter_wordcount compares against plain strings.
fdist = nltk.FreqDist(nltk.corpus.brown.words())
mostcommon = [word for word, count in fdist.most_common(150)]
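# Note (illustrative): Brown's most common tokens include punctuation, e.g.
# fdist.most_common(3) is roughly [('the', ...), (',', ...), ('.', ...)], so a
# few of the 150 slots go to tokens the letter-only scrub below never produces.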
def daily_count_words(words):
    """Sum the per-day counts of several words into one {date: total} dict.

    Relies on the module-level prsd_dated_tweets built under __main__.
    """
    w_counter = []
    for word in words:
        w_counter.append(count_word_in_tweets(word, prsd_dated_tweets))
    # Each per-word dict has the same date keys, so merge by adding values.
    return reduce(lambda x, y: dict((k, v + y[k]) for k, v in x.iteritems()), w_counter)
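# Illustrative call (hypothetical words, dates, and counts):
#   daily_count_words(['flood', 'storm'])
#   -> {datetime(2015, 12, 3, 0, 0): 7, datetime(2016, 1, 10, 0, 0): 2, ...}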
def combine_df(data):
    """Pivot {word: {date: count}} into {date: {word: count}}."""
    new = {}
    first_word = data.keys()[0]
    for k in data[first_word]:
        new[k] = {}
    for w in data:
        for k in data[w]:
            new[k][w] = data[w][k]
    return new
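# Illustrative pivot (hypothetical values): combine_df turns the per-word view
#   {'flood': {d1: 3, d2: 0}, 'storm': {d1: 1, d2: 4}}
# into the per-date view
#   {d1: {'flood': 3, 'storm': 1}, d2: {'flood': 0, 'storm': 4}}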
def parse_date(tweet):
    """Extract a datetime from the timestamp text left in a scraped tweet."""
    last_year_date = re.findall(r'(\d+\s\D{3}\s2015)', tweet)
    if last_year_date:
        return parser.parse(last_year_date[0])
    this_january_date = re.findall(r'\sJan\s(\d+)', tweet)
    # TODO: This January-only search could be widened to cover the whole
    # current year, like the last-year search above.
    if this_january_date:
        return parser.parse("%s Jan 2016" % this_january_date[0])
    if re.findall(r'([0-9]+)\shour', tweet):
        # "N hours ago" means the tweet was posted today; return today at
        # midnight so all of today's tweets share a single dictionary key.
        return datetime.combine(date.today(), datetime.min.time())
    return None
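# Illustrative inputs (hypothetical tweet fragments):
#   parse_date('... 3 Dec 2015 ...')   -> datetime(2015, 12, 3, 0, 0)
#   parse_date('... Jan 10 ...')       -> datetime(2016, 1, 10, 0, 0)
#   parse_date('... 5 hours ago ...')  -> today at midnight
#   parse_date('no timestamp at all')  -> None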
def parse_tweets(tweets):
    """Index tweets by position: {id: {'date': datetime or None, 'text': str}}."""
    prsd_tweets = {}
    for i, tweet in enumerate(tweets):
        prsd_tweets[i] = {'date': parse_date(tweet), 'text': tweet}
    return prsd_tweets
def parse_tweets_by_date(tweets):
    """Group raw tweet strings by parsed date: {datetime or None: [text, ...]}."""
    prsd_dated_tweets = {}
    for tweet in tweets:
        # setdefault parses each tweet's date once and creates the day's
        # list on first sight.
        prsd_dated_tweets.setdefault(parse_date(tweet), []).append(tweet)
    return prsd_dated_tweets
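# Illustrative output (hypothetical data); tweets with no parsable date all
# land under the key None, which the counting helpers below discard:
#   {datetime(2015, 12, 3, 0, 0): ['tweet a', 'tweet b'], None: ['tweet c']}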
def count_word_in_str(word, string):
    """Count case-insensitive occurrences of word in string (substring matches)."""
    # re.escape keeps tokens such as '|' or '...' from being read as regex syntax.
    return len(re.findall(re.escape(word), string.lower()))
def count_word_in_tweets(word, tweets):
    """Count occurrences of word per date in a {date: [tweet, ...]} mapping."""
    counter = {}
    for k in tweets:
        counter[k] = count_word_in_str(word, ' '.join(tweets[k]))
    counter.pop(None, None)  # drop the bucket of tweets with no parsable date
    return counter
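# Illustrative call (hypothetical word and counts):
#   count_word_in_tweets('flood', prsd_dated_tweets)
#   -> {datetime(2015, 12, 3, 0, 0): 4, datetime(2016, 1, 10, 0, 0): 1, ...}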
def count_word_in_tweets_for_df(word):
    """Build parallel {'Date': [...], word: [...]} columns, shaped for a
    DataFrame constructor. Relies on the module-level prsd_dated_tweets."""
    counter = {'Date': [], word: []}
    for k in prsd_dated_tweets:
        if k is None:  # skip tweets whose date could not be parsed
            continue
        counter['Date'].append(k)
        messages = ' '.join(prsd_dated_tweets[k])
        counter[word].append(count_word_in_str(word, messages))
    return counter
def find_files(folder):
    """List every .txt file in folder, printing each path as it is found."""
    filelist = glob.glob(folder + "/*.txt")
    for files in filelist:
        print files
    return filelist
def read_files(filelist):
    """Concatenate the contents of every file in filelist into one string."""
    text = ''
    for fileitem in filelist:
        with open(fileitem, 'r') as f:
            text += f.read()
    return text
def split_text_to_tweets(text):
    # The scraped pages separate tweets with "<N> like" markers; split on them.
    return re.split(r'\d\slike', text)
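# Illustrative split (hypothetical scrape fragment):
#   split_text_to_tweets('first tweet 3 like second tweet 0 like')
#   -> ['first tweet ', ' second tweet ', '']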
def twitter_wordcount(text, quantity):
    """Return the `quantity` most frequent words in text as (word, count)
    pairs, skipping stop words, digits, and common English words."""
    regex = re.compile('[^a-zA-Z]')  # strip everything but letters
    wordcount = {}
    for word in text.split():
        word = regex.sub('', word.lower())
        if word not in often_words and not word.isdigit() and word not in mostcommon:
            wordcount[word] = wordcount.get(word, 0) + 1
    return sorted(wordcount.iteritems(), key=operator.itemgetter(1), reverse=True)[:quantity]
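# Illustrative result (hypothetical counts):
#   twitter_wordcount(text, 3) -> [('flood', 41), ('storm', 27), ('rain', 19)]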
if __name__ == '__main__':
    print 'Found files:'
    text = read_files(find_files('data'))
    tweets = split_text_to_tweets(text)
    print 'Total number of tweets: %s' % len(tweets)
    print 'Fifty most common words in tweets:'
    print twitter_wordcount(text, 50)
    prsd_tweets = parse_tweets(tweets)
    prsd_dated_tweets = parse_tweets_by_date(tweets)
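    # Possible next step (a sketch, not in the original gist; assumes pandas
    # is installed and 'flood'/'storm' are example words):
    #   import pandas as pd
    #   data = {w: count_word_in_tweets(w, prsd_dated_tweets)
    #           for w in ['flood', 'storm']}
    #   df = pd.DataFrame.from_dict(combine_df(data), orient='index')
    # This yields one row per date and one column per word, which is the
    # shape the *_df helpers above are built for.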