Created
November 1, 2011 19:36
-
-
Save andreiolariu/1331665 to your computer and use it in GitHub Desktop.
Extracting the most used phrases in a text document
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# for more info check out http://webmining.olariu.org/is-winter-really-coming | |
import re | |
from math import log, sqrt | |
import matplotlib.pyplot as pyplot | |
DEPTH = 3 # minimum depth for tree construction = minimum phrase length | |
OCCURRENCES = 10 # minimum number of phrase occurrences | |
text = open('game.txt').read() # reading input data | |
text = text.lower() | |
text = text.replace('\xe2\x80\x99', '') # treating apostrophes | |
words = re.findall(r'\w+', text) # spliting words | |
# constructing the tree of phrases | |
tree = {} | |
# first, phrases of length 1 (aka words) are constructed | |
for i in range(len(words)): | |
word = words[i] | |
if word not in tree: | |
tree[word] = {'positions': []} | |
tree[word]['positions'].append(i) | |
# then, as long as they are frequent enough, | |
# we go deeper in constructing the tree | |
def expand(node): | |
if len(node['positions']) >= OCCURRENCES: | |
node['children'] = {} | |
for p in node['positions']: | |
if p + 1 < len(words): | |
word = words[p + 1] | |
if word not in node['children']: | |
node['children'][word] = {'positions': []} | |
node['children'][word]['positions'].append(p + 1) | |
for child in node['children'].itervalues(): | |
expand(child) | |
for node in tree.itervalues(): | |
expand(node) | |
# now we extract the frequent phrases of length > DEPTH | |
# we also compute a score based on the phrase length (in characters) | |
# and on the number of occurrences | |
phrases = [] | |
def search(node, before, depth): | |
if len(node['positions']) >= OCCURRENCES: | |
if depth >= DEPTH: | |
phrase = { | |
'text': before, | |
'score': len(before) * log(len(node['positions'])), | |
} | |
phrases.append(phrase) | |
for word, child in node['children'].iteritems(): | |
search(child, before + ' ' + word, depth + 1) | |
for word, node in tree.iteritems(): | |
search(node, word, 1) | |
# sort by score | |
phrases.sort(key=lambda x: -x['score']) | |
for p in phrases[:100]: | |
print p['text'] | |
# plot results | |
pyplot.plot([p['score'] for p in phrases], \ | |
[log(i + 1) for i in range(len(phrases))], 'ro') | |
pyplot.xlabel('phrase score') | |
pyplot.ylabel('log(phrase rank)') | |
pyplot.show() | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment