Created
June 19, 2015 14:37
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import json | |
import nltk | |
#import matplotlib.pyplot as plt | |
#import utils as tech | |
from nltk.corpus import stopwords | |
from nltk import bigrams,trigrams | |
from pprint import pprint | |
READ = 'r'

# Load the tweet corpus. The JSON file holds a list of JSON-encoded strings,
# each of which decodes to one tweet object (a dict with at least a 'text' key).
# Use a context manager so the file handle is closed (the original leaked it).
with open('twitter-output.json', READ) as corpus_file:
    corpus = [json.loads(item) for item in json.load(corpus_file)]
def cleanse(data, remove_stopwords=True):
    """Normalize raw tweets into per-tweet lists of lowercase ASCII tokens.

    Parameters:
        data: iterable of tweet dicts; each must carry a 'text' key.
        remove_stopwords: when True (default), drop English stopwords.
            (Bug fix: the original accepted this flag but ignored it.)

    Returns:
        A list with one token list per tweet, with URLs, stopwords
        (optionally) and non-ASCII tokens removed.
    """
    # Lowercase and whitespace-tokenize every tweet body.
    texts = [datum['text'].lower().split() for datum in data]
    # Build the stopword set ONCE: set membership is O(1), whereas the
    # original re-read the stopword list for every single token.
    stop = set(stopwords.words('english')) if remove_stopwords else set()
    # Drop URLs (both http and https -- the original missed https) and stopwords.
    texts = [[word for word in text
              if not word.startswith(('http://', 'https://'))
              and word not in stop]
             for text in texts]
    # Keep only pure-ASCII tokens (strips emoji / mis-encoded characters).
    return [[word for word in text if all(ord(ch) < 128 for ch in word)]
            for text in texts]
def extract_entities(selector, tweet):
    '''Tweet is a list of words; return those that begin with *selector*.'''
    matches = []
    for token in tweet:
        if token.startswith(selector):
            matches.append(token)
    return matches
def extract_hashtags(tweet):
    '''Tweet is a list of words; return the '#'-prefixed hashtag tokens.'''
    return [token for token in tweet if token.startswith('#')]
def extract_people(tweet):
    '''Tweet is a list of words; return the '@'-prefixed mention tokens.'''
    return [token for token in tweet if token.startswith('@')]
# Flatten the cleansed tweets into one continuous token stream.
tokens = [token for tweet in cleanse(corpus) for token in tweet]
# Unigram frequency distribution over the whole corpus.
word_frequencies = nltk.FreqDist(tokens)
# NLTK's bigrams/trigrams return generators; list() below consumes them.
bi_tokens = bigrams(tokens)
tri_tokens = trigrams(tokens)
pprint(list(bi_tokens))
# Bug fix: the original line was missing its closing parenthesis
# (`pprint(list(tri_tokens)`), a SyntaxError that broke the whole script.
pprint(list(tri_tokens))
'''
Exercises:
1. Partition data
2.
'''
''' Data Visualization'''
# Bug fix: the code below needs matplotlib and the project-local `utils`
# module, but their imports are commented out at the top of the file --
# without them every line here raises NameError. Import locally instead.
import matplotlib.pyplot as plt
import utils as tech  # project-local helper providing adjust_spines()

fig = plt.figure()
ax = fig.add_subplot(111)  # single axes filling the figure
# Plot the counts of the 25 most frequent tokens as a dashed black line.
words, freqs = zip(*word_frequencies.most_common(25))
ax.plot(freqs, 'k--', linewidth=2)
tech.adjust_spines(ax)
# Label each x position with its token, rotated so labels don't collide.
ax.set_xticks(range(len(words)))
ax.set_xticklabels(words, rotation='vertical', weight='bold')
ax.set_ylabel('Count')
plt.tight_layout()
plt.savefig('word-frequency')
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment