import operator
import nltk
import re
from dateutil import parser
from datetime import date, datetime
import glob
# Words to ignore in the frequency count: prepositions, auxiliaries, question
# words, and noise tokens from the scraped Twitter pages. The two byte strings
# are UTF-8 for an ellipsis and a CJK token that appears in the scrape.
often_words = ['aboard', 'about', 'above', 'across', 'after', 'against', 'along', 'amid',
               'among', 'anti', 'around', 'as', 'at', 'before', 'behind', 'below', 'beneath',
               'beside', 'besides', 'between', 'beyond', 'but', 'by', 'concerning',
               'considering', 'despite', 'down', 'during', 'except', 'excepting', 'excluding',
               'following', 'for', 'from', 'in', 'inside', 'into', 'like', 'minus', 'near',
               'of', 'off', 'on', 'onto', 'opposite', 'outside', 'over', 'past', 'per', 'the',
               'a', 'plus', 'regarding', 'round', 'save', 'since', 'than', 'through', 'to',
               'toward', 'towards', 'under', 'underneath', 'unlike', 'until', 'up', 'upon',
               'versus', 'via', 'with', 'within', 'without', 'account', 'embedded',
               'permalink', 'dec', 'jan', 'retweets', 'image', 'hours', '2015', 'reply',
               'you', 'is', 'are', 'am', 'was', 'were', 'will', 'do', 'does', 'did', 'have',
               'had', 'has', 'can', 'could', 'should', 'shall', 'may', 'might', 'would',
               'likes', 'retweet', 'more', '\xe2\x80\xa6', 'and', 'ago', 'what', 'when',
               'why', 'which', 'who', 'how', 'whose', 'whom', 'it', 'all', 'your', '21h21',
               '22h22', 'verified', 'new', 'be', '-', 'that', 'this', '&', 'out', 'not',
               'we', 'so', 'no', 'its', '\xe6\x9d\xb1\xe6\x96\xb9\xe7\xa5\x9e\xe8\xb5\xb7',
               '...', 'retweeted', '|', 'says', 'rt', 'lead', 'an', '',
               'httpwwwbbccouknewsuk', 'if', 'year', 'get', 'day', 'times', 'summary', 'our',
               'ho', 'i', 'added', 'now', 'york', 'been', 'gov', 'just', 'years', 'green',
               'great', 'or', 'daily', 'make', 'giving', 'time', 'view', 'my', 'some',
               'need', 'where', 'they', 'watch', 'use', 'high', 'help', 'police', 'seconds',
               'their', 'business']
# Baseline of very frequent English words from the Brown corpus. Keep only the
# word strings: most_common() returns (word, count) tuples, and the membership
# test in twitter_wordcount compares against plain strings.
fdist = nltk.FreqDist(nltk.corpus.brown.words())
mostcommon = [word for word, count in fdist.most_common(150)]
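# Note (illustrative): Brown's most common tokens include punctuation, e.g.
# fdist.most_common(3) is roughly [('the', ...), (',', ...), ('.', ...)], so a
# few of the 150 slots go to tokens the letter-only scrub below never produces.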
def daily_count_words(words):
    """Sum the per-day counts of several words into one {date: total} dict.

    Relies on the module-level prsd_dated_tweets built under __main__.
    """
    w_counter = []
    for word in words:
        w_counter.append(count_word_in_tweets(word, prsd_dated_tweets))
    # Each per-word dict has the same date keys, so merge by adding values.
    return reduce(lambda x, y: dict((k, v + y[k]) for k, v in x.iteritems()), w_counter)
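# Illustrative call (hypothetical words, dates, and counts):
#   daily_count_words(['flood', 'storm'])
#   -> {datetime(2015, 12, 3, 0, 0): 7, datetime(2016, 1, 10, 0, 0): 2, ...}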
def combine_df(data):
    """Pivot {word: {date: count}} into {date: {word: count}}."""
    new = {}
    first_word = data.keys()[0]
    for k in data[first_word]:
        new[k] = {}
    for w in data:
        for k in data[w]:
            new[k][w] = data[w][k]
    return new
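# Illustrative pivot (hypothetical values): combine_df turns the per-word view
#   {'flood': {d1: 3, d2: 0}, 'storm': {d1: 1, d2: 4}}
# into the per-date view
#   {d1: {'flood': 3, 'storm': 1}, d2: {'flood': 0, 'storm': 4}}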
def parse_date(tweet):
    """Extract a datetime from the timestamp text left in a scraped tweet."""
    last_year_date = re.findall(r'(\d+\s\D{3}\s2015)', tweet)
    if last_year_date:
        return parser.parse(last_year_date[0])
    this_january_date = re.findall(r'\sJan\s(\d+)', tweet)
    # TODO: This January-only search could be widened to cover the whole
    # current year, like the last-year search above.
    if this_january_date:
        return parser.parse("%s Jan 2016" % this_january_date[0])
    if re.findall(r'([0-9]+)\shour', tweet):
        # "N hours ago" means the tweet was posted today; return today at
        # midnight so all of today's tweets share a single dictionary key.
        return datetime.combine(date.today(), datetime.min.time())
    return None
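# Illustrative inputs (hypothetical tweet fragments):
#   parse_date('... 3 Dec 2015 ...')   -> datetime(2015, 12, 3, 0, 0)
#   parse_date('... Jan 10 ...')       -> datetime(2016, 1, 10, 0, 0)
#   parse_date('... 5 hours ago ...')  -> today at midnight
#   parse_date('no timestamp at all')  -> None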
def parse_tweets(tweets):
    """Index tweets by position: {id: {'date': datetime or None, 'text': str}}."""
    prsd_tweets = {}
    for i, tweet in enumerate(tweets):
        prsd_tweets[i] = {'date': parse_date(tweet), 'text': tweet}
    return prsd_tweets
def parse_tweets_by_date(tweets):
    """Group raw tweet strings by parsed date: {datetime or None: [text, ...]}."""
    prsd_dated_tweets = {}
    for tweet in tweets:
        # setdefault parses each tweet's date once and creates the day's
        # list on first sight.
        prsd_dated_tweets.setdefault(parse_date(tweet), []).append(tweet)
    return prsd_dated_tweets
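# Illustrative output (hypothetical data); tweets with no parsable date all
# land under the key None, which the counting helpers below discard:
#   {datetime(2015, 12, 3, 0, 0): ['tweet a', 'tweet b'], None: ['tweet c']}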
def count_word_in_str(word, string):
    """Count case-insensitive occurrences of word in string (substring matches)."""
    # re.escape keeps tokens such as '|' or '...' from being read as regex syntax.
    return len(re.findall(re.escape(word), string.lower()))
def count_word_in_tweets(word, tweets):
    """Count occurrences of word per date in a {date: [tweet, ...]} mapping."""
    counter = {}
    for k in tweets:
        counter[k] = count_word_in_str(word, ' '.join(tweets[k]))
    counter.pop(None, None)  # drop the bucket of tweets with no parsable date
    return counter
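# Illustrative call (hypothetical word and counts):
#   count_word_in_tweets('flood', prsd_dated_tweets)
#   -> {datetime(2015, 12, 3, 0, 0): 4, datetime(2016, 1, 10, 0, 0): 1, ...}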
def count_word_in_tweets_for_df(word):
    """Build parallel {'Date': [...], word: [...]} columns, shaped for a
    DataFrame constructor. Relies on the module-level prsd_dated_tweets."""
    counter = {'Date': [], word: []}
    for k in prsd_dated_tweets:
        if k is None:  # skip tweets whose date could not be parsed
            continue
        counter['Date'].append(k)
        messages = ' '.join(prsd_dated_tweets[k])
        counter[word].append(count_word_in_str(word, messages))
    return counter
def find_files(folder):
    """List every .txt file in folder, printing each path as it is found."""
    filelist = glob.glob(folder + "/*.txt")
    for files in filelist:
        print files
    return filelist
def read_files(filelist):
    """Concatenate the contents of every file in filelist into one string."""
    text = ''
    for fileitem in filelist:
        with open(fileitem, 'r') as f:
            text += f.read()
    return text
def split_text_to_tweets(text):
    # The scraped pages separate tweets with "<N> like" markers; split on them.
    return re.split(r'\d\slike', text)
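# Illustrative split (hypothetical scrape fragment):
#   split_text_to_tweets('first tweet 3 like second tweet 0 like')
#   -> ['first tweet ', ' second tweet ', '']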
def twitter_wordcount(text, quantity):
    """Return the `quantity` most frequent words in text as (word, count)
    pairs, skipping stop words, digits, and common English words."""
    regex = re.compile('[^a-zA-Z]')  # strip everything but letters
    wordcount = {}
    for word in text.split():
        word = regex.sub('', word.lower())
        if word not in often_words and not word.isdigit() and word not in mostcommon:
            wordcount[word] = wordcount.get(word, 0) + 1
    return sorted(wordcount.iteritems(), key=operator.itemgetter(1), reverse=True)[:quantity]
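# Illustrative result (hypothetical counts):
#   twitter_wordcount(text, 3) -> [('flood', 41), ('storm', 27), ('rain', 19)]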
if __name__ == '__main__':
    print 'Found files:'
    text = read_files(find_files('data'))
    tweets = split_text_to_tweets(text)
    print 'Total number of tweets: %s' % len(tweets)
    print 'Fifty most common words in tweets:'
    print twitter_wordcount(text, 50)
    prsd_tweets = parse_tweets(tweets)
    prsd_dated_tweets = parse_tweets_by_date(tweets)
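    # Possible next step (a sketch, not in the original gist; assumes pandas
    # is installed and 'flood'/'storm' are example words):
    #   import pandas as pd
    #   data = {w: count_word_in_tweets(w, prsd_dated_tweets)
    #           for w in ['flood', 'storm']}
    #   df = pd.DataFrame.from_dict(combine_df(data), orient='index')
    # This yields one row per date and one column per word, which is the
    # shape the *_df helpers above are built for.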