| """ | |
| Utilities for simple text analysis: word frequencies and co-occurrence graph. | |
| These tools can be used to analyze a plain text file treating it as a list of | |
| newline-separated sentences (e.g. a list of paper titles). | |
| It computes word frequencies (after doing some naive normalization by | |
| lowercasing and throwing away a few overly common words). It also computes, | |
| from the most common words, a weighted graph of word co-occurrences and | |
| displays it, as well as summarizing the graph structure by ranking its nodes in | |
| descending order of eigenvector centrality. | |
| This is meant as an illustration of text processing in Python, using matplotlib | |
| for visualization and NetworkX for graph-theoretical manipulation. It should | |
| not be considered production-strength code for serious text analysis. | |
| Author: Fernando Perez <[email protected]> | |
| """ | |
#-----------------------------------------------------------------------------
# Imports
#-----------------------------------------------------------------------------

# From the standard library
import os
import re
import urllib2

# Third-party libraries
import networkx as nx
import numpy as np
from matplotlib import pyplot as plt

#-----------------------------------------------------------------------------
# Function definitions
#-----------------------------------------------------------------------------

def rescale_arr(arr, amin, amax):
    """Rescale an array to a new range.

    Return a new array whose range of values is (amin, amax).

    Parameters
    ----------
    arr : array-like
    amin : float
        new minimum value
    amax : float
        new maximum value

    Examples
    --------
    >>> a = np.arange(5)
    >>> rescale_arr(a, 3, 6)
    array([ 3.  ,  3.75,  4.5 ,  5.25,  6.  ])
    """
    # old bounds
    m = arr.min()
    M = arr.max()
    # scale/offset
    s = float(amax - amin) / (M - m)
    d = amin - s*m
    # Apply clip before returning to cut off possible overflows outside the
    # intended range due to roundoff error, so that we can absolutely
    # guarantee that on output, there are no values > amax or < amin.
    return np.clip(s*arr + d, amin, amax)

def all_pairs(items):
    """Make all unique pairs (order doesn't matter)."""
    pairs = []
    nitems = len(items)
    for i, wi in enumerate(items):
        for j in range(i+1, nitems):
            pairs.append((wi, items[j]))
    return pairs

def removal_set(words, query):
    """Create a set of words for removal for a given query."""
    rem = set(words.split())
    qw = [w.lower() for w in query.split()]
    for w in qw:
        rem.add(w)
        rem.add('#' + w)
    qq = ''.join(qw)
    rem.add(qq)
    rem.add('#' + qq)
    return rem

def lines_cleanup(lines, min_length=4, remove=None):
    """Clean up a list of lowercase strings of text for simple analysis.

    Splits on whitespace, removes all 'words' less than `min_length`
    characters long, and those in the `remove` set.

    Returns a list of strings.
    """
    remove = set(remove) if remove is not None else set()
    filtered = []
    for line in lines:
        a = []
        for w in line.lower().split():
            wnorm = w.rstrip('.,:').replace('[', '').replace(']', '')
            if len(wnorm) >= min_length and wnorm not in remove:
                a.append(wnorm)
        filtered.append(' '.join(a))
    return filtered

def print_vk(lst):
    """Print a list of (word, count) pairs nicely formatted, one per line."""
    # Find the longest key: the word is element [0] of each pair, and we pad
    # to its length so the output lines up in a column.
    longest_key = max([len(word) for word, count in lst])
    # Make a format string out of it
    fmt = '%' + str(longest_key) + 's -> %s'
    # Do actual printing
    for k, v in lst:
        print fmt % (k, v)

def word_freq(text):
    """Return a dictionary of word frequencies for the given text.

    Input text should be given as an iterable of strings."""
    freqs = {}
    for word in text:
        freqs[word] = freqs.get(word, 0) + 1
    return freqs

def sort_freqs(freqs):
    """Sort a word frequency histogram represented as a dictionary.

    Parameters
    ----------
    freqs : dict
        A dict with string keys and integer values.

    Returns
    -------
    items : list
        A list of (word, count) pairs, sorted in ascending order of count.
    """
    items = freqs.items()
    items.sort(key=lambda wc: wc[1])
    return items

def summarize_freq_hist(freqs, n=10):
    """Print a simple summary of a word frequencies dictionary.

    Parameters
    ----------
    freqs : dict or list
        Word frequencies, represented either as a dict of word->count, or as
        a list of (word, count) pairs.
    n : int
        The number of least/most frequent words to print.
    """
    items = sort_freqs(freqs) if isinstance(freqs, dict) else freqs
    print 'Number of unique words:', len(freqs)
    print '%d least frequent words:' % n
    print_vk(items[:n])
    print '%d most frequent words:' % n
    print_vk(items[-n:])

def co_occurrences(lines, words):
    """Return histogram of co-occurrences of words in a list of lines.

    Parameters
    ----------
    lines : list
        A list of strings considered as 'sentences' to search for
        co-occurrences.
    words : list
        A list of words from which all unordered pairs will be constructed
        and searched for co-occurrences.
    """
    wpairs = all_pairs(words)
    # Now build histogram of co-occurrences
    co_occur = {}
    for w1, w2 in wpairs:
        rx = re.compile('%s .*%s|%s .*%s' % (w1, w2, w2, w1))
        co_occur[w1, w2] = sum([1 for line in lines if rx.search(line)])
    return co_occur

def co_occurrences_graph(word_hist, co_occur, cutoff=0):
    """Convert a word histogram with co-occurrences to a weighted graph.

    Edges are only added if the count is above cutoff.
    """
    g = nx.Graph()
    for word, count in word_hist:
        g.add_node(word, count=count)
    for (w1, w2), count in co_occur.iteritems():
        if count <= cutoff:
            continue
        g.add_edge(w1, w2, weight=count)
    return g

# Hack: offset the most central node to avoid too much overlap
rad0 = 0.3

def centrality_layout(wgraph, centrality):
    """Compute a layout based on centrality.
    """
    # Create a list of centralities, sorted by centrality value
    cent = sorted(centrality.items(), key=lambda x: float(x[1]), reverse=True)
    nodes = [c[0] for c in cent]
    cent = np.array([float(c[1]) for c in cent])
    rad = (cent - cent[0]) / (cent[-1] - cent[0])
    rad = rescale_arr(rad, rad0, 1)
    angles = np.linspace(0, 2*np.pi, len(centrality))
    layout = {}
    for n, node in enumerate(nodes):
        r = rad[n]
        th = angles[n]
        layout[node] = r*np.cos(th), r*np.sin(th)
    return layout

def plot_graph(wgraph, pos=None, fig=None, title=None):
    """Conveniently summarize graph visually."""
    # config parameters
    edge_min_width = 3
    edge_max_width = 12
    label_font = 18
    node_font = 22
    node_alpha = 0.4
    edge_alpha = 0.55
    edge_cmap = plt.cm.Spectral

    # Create figure
    if fig is None:
        fig, ax = plt.subplots()
    else:
        ax = fig.add_subplot(111)
    fig.subplots_adjust(0, 0, 1)

    # Plot nodes with size according to count
    sizes = []
    degrees = []
    for n, d in wgraph.nodes_iter(data=True):
        sizes.append(d['count'])
        degrees.append(wgraph.degree(n))
    sizes = rescale_arr(np.array(sizes, dtype=float), 100, 1000)

    # Compute layout and label edges according to weight
    pos = nx.spring_layout(wgraph) if pos is None else pos
    labels = {}
    width = []
    for n1, n2, d in wgraph.edges_iter(data=True):
        w = d['weight']
        labels[n1, n2] = w
        width.append(w)
    width = rescale_arr(np.array(width, dtype=float), edge_min_width,
                        edge_max_width)

    # Draw
    nx.draw_networkx_nodes(wgraph, pos, node_size=sizes, node_color=degrees,
                           alpha=node_alpha)
    nx.draw_networkx_edges(wgraph, pos, width=width, edge_color=width,
                           edge_cmap=edge_cmap, alpha=edge_alpha)
    nx.draw_networkx_edge_labels(wgraph, pos, edge_labels=labels,
                                 font_size=label_font)
    nx.draw_networkx_labels(wgraph, pos, font_size=node_font,
                            font_weight='bold')
    if title is not None:
        ax.set_title(title, fontsize=label_font)
    ax.set_xticks([])
    ax.set_yticks([])

    # Mark centrality axes
    kw = dict(color='k', linestyle='-')
    cross = [ax.axhline(0, **kw), ax.axvline(rad0, **kw)]
    [l.set_zorder(0) for l in cross]

def plot_word_histogram(freqs, show=10, title=None):
    """Plot a histogram of word frequencies, limited to the top `show` ones.
    """
    sorted_f = sort_freqs(freqs) if isinstance(freqs, dict) else freqs
    # Don't show the tail
    if isinstance(show, int):
        # interpret as number of words to show in histogram
        show_f = sorted_f[-show:]
    else:
        # interpret as a fraction
        start = -int(round(show*len(freqs)))
        show_f = sorted_f[start:]
    # Now, extract words and counts, plot
    n_words = len(show_f)
    ind = np.arange(n_words)
    words = [i[0] for i in show_f]
    counts = [i[1] for i in show_f]
    fig = plt.figure()
    ax = fig.add_subplot(111)
    if n_words <= 20:
        # Only show bars and x labels for small histograms, they don't make
        # sense otherwise
        ax.bar(ind, counts)
        ax.set_xticks(ind)
        ax.set_xticklabels(words, rotation=45)
        fig.subplots_adjust(bottom=0.25)
    else:
        # For larger ones, do a step plot
        ax.step(ind, counts)
    # If it spans more than two decades, use a log scale
    if float(max(counts)) / min(counts) > 100:
        ax.set_yscale('log')
    if title:
        ax.set_title(title)
    return ax

def summarize_centrality(centrality):
    """Print nodes ranked by centrality, highest first."""
    c = centrality.items()
    c.sort(key=lambda x: x[1], reverse=True)
    print '\nGraph centrality'
    for node, cent in c:
        print "%15s: %.3g" % (node, float(cent))
Cool way of distributing IPython notebooks, though!
I've updated the code so that it now works; you just have to register an application on the Twitter developer site to get a Consumer Key and a Consumer Secret.
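For reference, a minimal sketch of the app-only (OAuth 2) flow those credentials plug into, following the Twython documentation; the key and secret values are placeholders:

from twython import Twython

# Placeholders -- substitute the Consumer Key / Consumer Secret of your
# registered application.
APP_KEY = 'your-consumer-key'
APP_SECRET = 'your-consumer-secret'

# Obtain an application-only bearer token, then build the client that the
# notebook's search calls use.
twitter = Twython(APP_KEY, APP_SECRET, oauth_version=2)
ACCESS_TOKEN = twitter.obtain_access_token()
twitter = Twython(APP_KEY, access_token=ACCESS_TOKEN)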
I get a long error while performing the query, shown below:
TwythonAuthError Traceback (most recent call last)
<ipython-input> in <module>()
4 retweets = []
5 for page in range(1, n_pages+1):
----> 6 search = twitter.search(q=query+' lang:en', page=str(page))
7 res = search['results']
8 if not res:
/usr/local/lib/python2.7/dist-packages/twython/endpoints.pyc in search(self, **params)
134
135 """
--> 136 return self.get('search/tweets', params=params)
137 search.iter_mode = 'id'
138 search.iter_key = 'statuses'
/usr/local/lib/python2.7/dist-packages/twython/api.pyc in get(self, endpoint, params, version)
228 def get(self, endpoint, params=None, version='1.1'):
229 """Shortcut for GET requests via :class:request"""
--> 230 return self.request(endpoint, params=params, version=version)
231
232 def post(self, endpoint, params=None, version='1.1'):
/usr/local/lib/python2.7/dist-packages/twython/api.pyc in request(self, endpoint, method, params, version)
222 url = '%s/%s.json' % (self.api_url % version, endpoint)
223
--> 224 content = self._request(url, method=method, params=params, api_call=url)
225
226 return content
/usr/local/lib/python2.7/dist-packages/twython/api.pyc in _request(self, url, method, params, api_call)
192 raise ExceptionType(error_message,
193 error_code=response.status_code,
--> 194 retry_after=response.headers.get('retry-after'))
195
196 # if we have a json error here, then it's not an official Twitter API error
TwythonAuthError: Twitter API returned a 400 (Bad Request), Bad Authentication data
Hi, I tried to run
wgraph = nx.connected_component_subgraphs(wgraph)[0]
but I received
TypeError Traceback (most recent call last)
<ipython-input> in <module>()
5 co_occur = co_occurrences(twt_sent, pop_words)
6 wgraph = co_occurrences_graph(popular, co_occur, cutoff=1)
----> 7 wgraph = nx.connected_component_subgraphs(wgraph)[0]
TypeError: 'generator' object is not subscriptable
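That error comes from newer NetworkX versions, where connected_component_subgraphs returns a generator (and the function was removed entirely in NetworkX 2.4), so it can no longer be indexed. A version-dependent workaround sketch:

# NetworkX 1.9 - 2.3: connected_component_subgraphs yields subgraphs lazily,
# so take the largest component instead of indexing.
wgraph = max(nx.connected_component_subgraphs(wgraph), key=len)

# NetworkX >= 2.4 (alternative): the helper is gone, so build the subgraph
# of the largest connected component explicitly.
wgraph = wgraph.subgraph(max(nx.connected_components(wgraph), key=len)).copy()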
This doesn't work any longer. Your Twitter requests need to be authorized...
TwythonError: 'Bad Request: The request was invalid. An accompanying error message will explain why. This is the status code will be returned during rate limiting. -- An error occurred processing your request.'
https://pypi.python.org/pypi/twython
The link above describes the format, but getting authorization was beyond me. If I figure it out I will post a real solution, but for now I just hope this saves someone the time I wasted.
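For anyone who does get authentication working, one more mismatch to expect (this is about the Twitter API 1.1 response format, not something from the notebook itself): search results come back under the 'statuses' key rather than the old 'results' key, so the query loop needs roughly:

# Sketch only: 'statuses' is the API 1.1 key; the notebook's search['results']
# belongs to the retired 1.0 search API.
search = twitter.search(q=query + ' lang:en', count=100)
res = search['statuses']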