Skip to content

Instantly share code, notes, and snippets.

@andreiolariu
Created November 1, 2011 19:36
Show Gist options
  • Save andreiolariu/1331665 to your computer and use it in GitHub Desktop.
Save andreiolariu/1331665 to your computer and use it in GitHub Desktop.
Extracting the most used phrases in a text document
# for more info check out http://webmining.olariu.org/is-winter-really-coming
import re
from math import log, sqrt
import matplotlib.pyplot as pyplot
DEPTH = 3 # minimum depth for tree construction = minimum phrase length
OCCURRENCES = 10 # minimum number of phrase occurrences
text = open('game.txt').read() # reading input data
text = text.lower()
text = text.replace('\xe2\x80\x99', '') # treating apostrophes
words = re.findall(r'\w+', text) # spliting words
# constructing the tree of phrases
tree = {}
# first, phrases of length 1 (aka words) are constructed
for i in range(len(words)):
word = words[i]
if word not in tree:
tree[word] = {'positions': []}
tree[word]['positions'].append(i)
# then, as long as they are frequent enough,
# we go deeper in constructing the tree
def expand(node):
if len(node['positions']) >= OCCURRENCES:
node['children'] = {}
for p in node['positions']:
if p + 1 < len(words):
word = words[p + 1]
if word not in node['children']:
node['children'][word] = {'positions': []}
node['children'][word]['positions'].append(p + 1)
for child in node['children'].itervalues():
expand(child)
for node in tree.itervalues():
expand(node)
# now we extract the frequent phrases of length > DEPTH
# we also compute a score based on the phrase length (in characters)
# and on the number of occurrences
phrases = []
def search(node, before, depth):
if len(node['positions']) >= OCCURRENCES:
if depth >= DEPTH:
phrase = {
'text': before,
'score': len(before) * log(len(node['positions'])),
}
phrases.append(phrase)
for word, child in node['children'].iteritems():
search(child, before + ' ' + word, depth + 1)
for word, node in tree.iteritems():
search(node, word, 1)
# sort by score
phrases.sort(key=lambda x: -x['score'])
for p in phrases[:100]:
print p['text']
# plot results
pyplot.plot([p['score'] for p in phrases], \
[log(i + 1) for i in range(len(phrases))], 'ro')
pyplot.xlabel('phrase score')
pyplot.ylabel('log(phrase rank)')
pyplot.show()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment