""" | |
Utilities for simple text analysis: word frequencies and co-occurrence graph. | |
These tools can be used to analyze a plain text file treating it as a list of | |
newline-separated sentences (e.g. a list of paper titles). | |
It computes word frequencies (after doing some naive normalization by | |
lowercasing and throwing away a few overly common words). It also computes, | |
from the most common words, a weighted graph of word co-occurrences and | |
displays it, as well as summarizing the graph structure by ranking its nodes in | |
descending order of eigenvector centrality. | |
This is meant as an illustration of text processing in Python, using matplotlib | |
for visualization and NetworkX for graph-theoretical manipulation. It should | |
not be considered production-strength code for serious text analysis. | |
Author: Fernando Perez <[email protected]> | |
""" | |
#-----------------------------------------------------------------------------
# Imports
#-----------------------------------------------------------------------------

# From the standard library
import os
import re
import urllib2

# Third-party libraries
import networkx as nx
import numpy as np
from matplotlib import pyplot as plt

#-----------------------------------------------------------------------------
# Function definitions
#-----------------------------------------------------------------------------

def rescale_arr(arr, amin, amax):
    """Rescale an array to a new range.

    Return a new array whose range of values is (amin, amax).

    Parameters
    ----------
    arr : array-like
    amin : float
      new minimum value
    amax : float
      new maximum value

    Examples
    --------
    >>> a = np.arange(5)
    >>> rescale_arr(a, 3, 6)
    array([ 3.  ,  3.75,  4.5 ,  5.25,  6.  ])
    """
    # old bounds
    m = arr.min()
    M = arr.max()
    # scale/offset
    s = float(amax-amin)/(M-m)
    d = amin - s*m

    # Apply clip before returning to cut off possible overflows outside the
    # intended range due to roundoff error, so that we can absolutely guarantee
    # that on output, there are no values > amax or < amin.
    return np.clip(s*arr+d, amin, amax)

def all_pairs(items):
    """Make all unique pairs (order doesn't matter)"""
    pairs = []
    nitems = len(items)
    for i, wi in enumerate(items):
        for j in range(i+1, nitems):
            pairs.append((wi, items[j]))
    return pairs

def removal_set(words, query):
    """Create a set of words for removal for a given query."""
    rem = set(words.split())
    qw = [w.lower() for w in query.split()]
    for w in qw:
        rem.add(w)
        rem.add('#' + w)
    qq = ''.join(qw)
    rem.add(qq)
    rem.add('#' + qq)
    return rem

def lines_cleanup(lines, min_length=4, remove=None):
    """Clean up a list of lowercase strings of text for simple analysis.

    Splits on whitespace, removes all 'words' less than `min_length` characters
    long, and those in the `remove` set.

    Returns a list of strings.
    """
    remove = set(remove) if remove is not None else set()
    filtered = []
    for line in lines:
        a = []
        for w in line.lower().split():
            wnorm = w.rstrip('.,:').replace('[', '').replace(']', '')
            if len(wnorm) >= min_length and wnorm not in remove:
                a.append(wnorm)
        filtered.append(' '.join(a))
    return filtered

def print_vk(lst):
    """Print a list of (word, count) pairs nicely formatted in key/value order."""
    # Find the longest key so the output lines up; each entry in the list is a
    # (word, count) pair, so the key is element [0]
    longest_key = max([len(word) for word, count in lst])
    # Make a format string out of it
    fmt = '%'+str(longest_key)+'s -> %s'
    # Do actual printing
    for k, v in lst:
        print fmt % (k, v)

def word_freq(text):
    """Return a dictionary of word frequencies for the given text.

    Input text should be given as an iterable of strings."""
    freqs = {}
    for word in text:
        freqs[word] = freqs.get(word, 0) + 1
    return freqs

def sort_freqs(freqs):
    """Sort a word frequency histogram represented as a dictionary.

    Parameters
    ----------
    freqs : dict
      A dict with string keys and integer values.

    Returns
    -------
    items : list
      A list of (word, count) pairs, sorted in ascending order of count.
    """
    items = freqs.items()
    items.sort(key=lambda wc: wc[1])
    return items

def summarize_freq_hist(freqs, n=10):
    """Print a simple summary of a word frequencies dictionary.

    Parameters
    ----------
    freqs : dict or list
      Word frequencies, represented either as a dict of word->count, or as a
      list of (word, count) pairs.
    n : int
      The number of least/most frequent words to print.
    """
    items = sort_freqs(freqs) if isinstance(freqs, dict) else freqs
    print 'Number of unique words:', len(freqs)
    print '%d least frequent words:' % n
    print_vk(items[:n])
    print '%d most frequent words:' % n
    print_vk(items[-n:])

def co_occurrences(lines, words):
    """Return histogram of co-occurrences of words in a list of lines.

    Parameters
    ----------
    lines : list
      A list of strings considered as 'sentences' to search for co-occurrences.
    words : list
      A list of words from which all unordered pairs will be constructed and
      searched for co-occurrences.
    """
    wpairs = all_pairs(words)

    # Now build histogram of co-occurrences
    co_occur = {}
    for w1, w2 in wpairs:
        rx = re.compile('%s .*%s|%s .*%s' % (w1, w2, w2, w1))
        co_occur[w1, w2] = sum([1 for line in lines if rx.search(line)])

    return co_occur

def co_occurrences_graph(word_hist, co_occur, cutoff=0):
    """Convert a word histogram with co-occurrences to a weighted graph.

    Edges are only added if the count is above cutoff.
    """
    g = nx.Graph()
    for word, count in word_hist:
        g.add_node(word, count=count)
    for (w1, w2), count in co_occur.iteritems():
        if count <= cutoff:
            continue
        g.add_edge(w1, w2, weight=count)
    return g


# Hack: offset the most central node to avoid too much overlap
rad0 = 0.3

def centrality_layout(wgraph, centrality):
    """Compute a layout based on centrality.
    """
    # Create a list of centralities, sorted by centrality value
    cent = sorted(centrality.items(), key=lambda x: float(x[1]), reverse=True)
    nodes = [c[0] for c in cent]
    cent = np.array([float(c[1]) for c in cent])
    rad = (cent - cent[0])/(cent[-1]-cent[0])
    rad = rescale_arr(rad, rad0, 1)
    angles = np.linspace(0, 2*np.pi, len(centrality))
    layout = {}
    for n, node in enumerate(nodes):
        r = rad[n]
        th = angles[n]
        layout[node] = r*np.cos(th), r*np.sin(th)
    return layout

def plot_graph(wgraph, pos=None, fig=None, title=None):
    """Conveniently summarize graph visually"""
    # config parameters
    edge_min_width = 3
    edge_max_width = 12
    label_font = 18
    node_font = 22
    node_alpha = 0.4
    edge_alpha = 0.55
    edge_cmap = plt.cm.Spectral

    # Create figure
    if fig is None:
        fig, ax = plt.subplots()
    else:
        ax = fig.add_subplot(111)
    fig.subplots_adjust(0, 0, 1)

    # Plot nodes with size according to count
    sizes = []
    degrees = []
    for n, d in wgraph.nodes_iter(data=True):
        sizes.append(d['count'])
        degrees.append(wgraph.degree(n))
    sizes = rescale_arr(np.array(sizes, dtype=float), 100, 1000)

    # Compute layout and label edges according to weight
    pos = nx.spring_layout(wgraph) if pos is None else pos
    labels = {}
    width = []
    for n1, n2, d in wgraph.edges_iter(data=True):
        w = d['weight']
        labels[n1, n2] = w
        width.append(w)
    width = rescale_arr(np.array(width, dtype=float), edge_min_width,
                        edge_max_width)

    # Draw
    nx.draw_networkx_nodes(wgraph, pos, node_size=sizes, node_color=degrees,
                           alpha=node_alpha)
    nx.draw_networkx_edges(wgraph, pos, width=width, edge_color=width,
                           edge_cmap=edge_cmap, alpha=edge_alpha)
    nx.draw_networkx_edge_labels(wgraph, pos, edge_labels=labels,
                                 font_size=label_font)
    nx.draw_networkx_labels(wgraph, pos, font_size=node_font, font_weight='bold')
    if title is not None:
        ax.set_title(title, fontsize=label_font)
    ax.set_xticks([])
    ax.set_yticks([])

    # Mark centrality axes
    kw = dict(color='k', linestyle='-')
    cross = [ax.axhline(0, **kw), ax.axvline(rad0, **kw)]
    [l.set_zorder(0) for l in cross]

def plot_word_histogram(freqs, show=10, title=None):
    """Plot a histogram of word frequencies, limited to the top `show` ones.
    """
    sorted_f = sort_freqs(freqs) if isinstance(freqs, dict) else freqs

    # Don't show the tail
    if isinstance(show, int):
        # interpret as number of words to show in histogram
        show_f = sorted_f[-show:]
    else:
        # interpret as a fraction
        start = -int(round(show*len(freqs)))
        show_f = sorted_f[start:]

    # Now, extract words and counts, plot
    n_words = len(show_f)
    ind = np.arange(n_words)
    words = [i[0] for i in show_f]
    counts = [i[1] for i in show_f]

    fig = plt.figure()
    ax = fig.add_subplot(111)
    if n_words <= 20:
        # Only show bars and x labels for small histograms, they don't make
        # sense otherwise
        ax.bar(ind, counts)
        ax.set_xticks(ind)
        ax.set_xticklabels(words, rotation=45)
        fig.subplots_adjust(bottom=0.25)
    else:
        # For larger ones, do a step plot
        ax.step(ind, counts)

    # If it spans more than two decades, use a log scale
    if float(max(counts))/min(counts) > 100:
        ax.set_yscale('log')

    if title:
        ax.set_title(title)
    return ax

def summarize_centrality(centrality):
    """Print a summary of node centralities, sorted in descending order."""
    c = centrality.items()
    c.sort(key=lambda x: x[1], reverse=True)
    print '\nGraph centrality'
    for node, cent in c:
        print "%15s: %.3g" % (node, float(cent))
cool way of distributing ipython notebooks though!
I've updated the code so that now it's working; you just have to go here and register an application to get a Consumer Key and a Consumer Secret.
I get a long error while performing the query which is given below:
TwythonAuthError                          Traceback (most recent call last)
<ipython-input-...> in <module>()
      4 retweets = []
      5 for page in range(1, n_pages+1):
----> 6     search = twitter.search(q=query+' lang:en', page=str(page))
      7     res = search['results']
      8     if not res:

/usr/local/lib/python2.7/dist-packages/twython/endpoints.pyc in search(self, **params)
    134
    135         """
--> 136         return self.get('search/tweets', params=params)
    137     search.iter_mode = 'id'
    138     search.iter_key = 'statuses'

/usr/local/lib/python2.7/dist-packages/twython/api.pyc in get(self, endpoint, params, version)
    228     def get(self, endpoint, params=None, version='1.1'):
    229         """Shortcut for GET requests via :class:request
                """
--> 230         return self.request(endpoint, params=params, version=version)
    231
    232     def post(self, endpoint, params=None, version='1.1'):

/usr/local/lib/python2.7/dist-packages/twython/api.pyc in request(self, endpoint, method, params, version)
    222         url = '%s/%s.json' % (self.api_url % version, endpoint)
    223
--> 224         content = self._request(url, method=method, params=params, api_call=url)
    225
    226         return content

/usr/local/lib/python2.7/dist-packages/twython/api.pyc in _request(self, url, method, params, api_call)
    192             raise ExceptionType(error_message,
    193                                 error_code=response.status_code,
--> 194                                 retry_after=response.headers.get('retry-after'))
    195
    196         # if we have a json error here, then it's not an official Twitter API error

TwythonAuthError: Twitter API returned a 400 (Bad Request), Bad Authentication data
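
The 400 / "Bad Authentication data" response means the request was not signed: every call to the 1.1 API has to be authenticated. A possible fix is sketched below, assuming Twython's application-only OAuth 2 flow; APP_KEY and APP_SECRET are placeholders for the Consumer Key/Secret of your registered app. Note also that the 1.1 search endpoint returns its tweets under 'statuses', not 'results'.

from twython import Twython

APP_KEY = 'your-consumer-key'        # placeholder
APP_SECRET = 'your-consumer-secret'  # placeholder

# Exchange the app credentials for an OAuth 2 bearer token, then re-create
# the client with that token so every request is signed.
twitter = Twython(APP_KEY, APP_SECRET, oauth_version=2)
ACCESS_TOKEN = twitter.obtain_access_token()
twitter = Twython(APP_KEY, access_token=ACCESS_TOKEN)

search = twitter.search(q='ipython lang:en', count=100)
tweets = search['statuses']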
Hi
I tried to run
wgraph = nx.connected_component_subgraphs(wgraph)[0]
but I received
TypeError                                 Traceback (most recent call last)
<ipython-input-...> in <module>()
      5 co_occur = co_occurrences(twt_sent, pop_words)
      6 wgraph = co_occurrences_graph(popular, co_occur, cutoff=1)
----> 7 wgraph = nx.connected_component_subgraphs(wgraph)[0]

TypeError: 'generator' object is not subscriptable
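
That error comes from a NetworkX API change: connected_component_subgraphs() started returning a generator instead of a size-sorted list, and it was removed altogether in NetworkX 2.4. A possible replacement, assuming the goal is the largest connected component (which is what indexing [0] used to rely on):

# NetworkX versions where the function still exists but yields a generator:
wgraph = max(nx.connected_component_subgraphs(wgraph), key=len)

# NetworkX >= 2.4, where the function was removed:
wgraph = wgraph.subgraph(max(nx.connected_components(wgraph), key=len)).copy()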
This doesn't work any longer. Your twitter requests need to be authorized...
TwythonError: 'Bad Request: The request was invalid. An accompanying error message will explain why. This is the status code will be returned during rate limiting. -- An error occurred processing your request.'
https://pypi.python.org/pypi/twython
The link above describes the form, but getting authorization working was beyond me. If I figure it out I will post a real solution, but for now I just hope this saves someone the time I wasted.