Simple Definition Extraction attempt
import nltk
from nltk import tokenize
from nltk import tag
from nltk import chunk
import xml.etree.cElementTree as ET
except ImportError:
import xml.etree.ElementTree as ET
import re
import cPickle as pickle
# Input: a file from the Stack Exchange Data Dump goes here. Posts.xml is the
# most interesting one. It is opened in binary mode and parsed into an
# ElementTree.
inputFile = open("input/Posts.xml", 'rb')
tree = ET.parse(inputFile)
# NOTE(review): this assignment looks truncated (a stray closing parenthesis
# and no call in front of the string) -- presumably a loader call wrapping the
# Punkt pickle path was lost; confirm against the original script.
tokenizer ='tokenizers/punkt/english.pickle')
# A case-insensitive regex matching things that look like URLs. It is used
# further down with .sub("") to strip them out of the post text.
re_URL = re.compile(ur'(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:\'".,<>?\xab\xbb\u201c\u201d\u2018\u2019]))')
# Colors so we can get better user input when we ask the user to select
# definitional sentence candidates
class color:
    """ANSI terminal escape sequences used to decorate text shown to the user.

    Prepend one or more of these codes to a string and append ``END`` so the
    terminal formatting is reset afterwards. The class is only a namespace
    for constants and is never instantiated.
    """

    # Foreground colours.
    PURPLE = '\033[95m'
    CYAN = '\033[96m'
    DARKCYAN = '\033[36m'
    BLUE = '\033[94m'
    GREEN = '\033[92m'
    YELLOW = '\033[93m'
    RED = '\033[91m'

    # Text attributes.
    BOLD = '\033[1m'
    UNDERLINE = '\033[4m'

    # Resets every colour/attribute set by the codes above.
    END = '\033[0m'
# Cue phrases that flag a sentence as a candidate definition. Not the best
# method of definitional-sentence identification, but very simple to implement.
# NOTE(review): no closing bracket is visible for this list -- confirm it was
# not lost.
patternList = [
"defined as",
"defined by",
"definition of",
"a definition",
"the definition",
"known as"
# Phrases that disqualify a candidate sentence. Massively reduces false
# positives, improving precision, but hurting recall.
# Is it worth it? I don't know.
# NOTE(review): no closing bracket is visible for this list either.
excludeList = [
"called with",
# Extract all questions, answers, and comments from 'Posts.xml' and put them
# in a dictionary. Rows without a "ParentId" get an entry holding their
# HTML-stripped, lower-cased body.
temporaryDictionary = {}
for row in tree.findall('row'):
if not row.get("ParentId"):
# NOTE(review): the left-hand side of this assignment looks truncated --
# presumably it indexes temporaryDictionary by the row's "Id"; confirm.
row.get("Id")] = [nltk.clean_html(row.get("Body")).lower()]
# NOTE(review): this branch has no visible body -- presumably the child row's
# text is added to its parent's entry; confirm.
elif row.get("ParentId") in temporaryDictionary.keys():
# And now we're going to undo all our work with the dictionary and chunk all
# of that text into one big list of sentences.
sentences = []
for item in temporaryDictionary:
# Join every text stored under this key, with URLs removed.
tmpSentence = ""
for definition in temporaryDictionary[item]:
tmpSentence = tmpSentence + " " + re_URL.sub("", definition)
# NOTE(review): nothing visible adds tmpSentence to `sentences`; confirm a
# tokenize/extend step was not lost here.
# Earlier approach, kept for reference:
# sentences = []
# for row in tree.findall('row'):
# string = nltk.clean_html(row.get("Body")).lower()
# sentences.extend(tokenizer.tokenize(string))
def highlight_words(sentence, words):
    """Return ``sentence`` with the given phrases highlighted.

    A simple function to highlight any of our desired words in a string of
    text; this is why we have the ``color`` class up at the top.

    Args:
        sentence: the text to decorate.
        words: iterable of literal phrases to highlight.

    Returns:
        A copy of ``sentence`` in which each phrase from ``words`` that occurs
        in the original sentence is wrapped in underline + bold escape codes
        and followed by the reset code. When nothing matches, the sentence is
        returned unchanged.
    """
    newsentence = sentence
    for pattern in words:
        # Skip empty phrases (they would "match" between every character) and
        # phrases that do not occur in the original sentence.
        if not pattern or pattern not in sentence:
            continue
        # Literal replacement: the phrase is plain text, not a regular
        # expression, so characters such as "." or "(" must not be interpreted
        # and backslashes in the replacement must not be treated as escapes.
        newsentence = newsentence.replace(
            pattern, color.UNDERLINE + color.BOLD + pattern + color.END)
    return newsentence
# Go through the list of sentences, and if they contain any of the cue
# patterns (and none of the excluded ones), ask the user whether they are
# actual definitional sentences.
# It occurs to me now that I don't have a good standard for what constitutes
# "definitional".
definitions = []
for sentence in sentences:
# print sentence
if any(pattern in sentence for pattern in patternList) and not any(pattern in sentence for pattern in excludeList):
# Visual separator, then the candidate with its cue phrases highlighted.
print "\n-\n-\n-\n-\n-\n"
print highlight_words(sentence, patternList)
is_definition = raw_input(
"\n------\nDoes this look like a defintion? (y/n):\n------\n")
# NOTE(review): this `if` has no visible body -- presumably the accepted
# sentence is added to `definitions`; confirm.
if is_definition == "y":
# And write out that list of sentences to a file, using one of the weirdest
# named Python libraries.
pickle.dump(definitions, open("output/training_defintions.p", "wb"))
# Open the sentences identified in part 1 (the pickle written above).
sentences = pickle.load(open("output/training_defintions.p", "rb"))
string = []
tree = []
# Tokenize them and tag them, nothing too complicated: each sentence is
# encoded as UTF-8 (unencodable characters replaced), split into word tokens
# and part-of-speech tagged.
for sentence in sentences:
chunked = tokenize.word_tokenize(
sentence.encode(encoding='UTF-8', errors='replace'))
tagged_sent = tag.pos_tag(chunked)
# NOTE(review): nothing visible adds tagged_sent to `tree`, so as shown an
# empty list is pickled here; confirm an append was not lost.
pickle.dump(tree, open("output/tagged_sents.p", "wb"))
# Print it all out, for debugging and to be stared at in confusion
for line in tree:
print line
import nltk
import cPickle as pickle
import pprint
# Attempting to use a chunk grammar (fed to nltk.RegexpParser below) to find
# patterns in definitional sentences. The tagged sentences pickled by the
# previous stage are loaded back in, and the cue-phrase list is repeated.
# NOTE(review): no closing triple quote is visible for the grammar string, and
# no closing bracket for patternList -- confirm they were not lost.
grammar = r"""
NP: {<DT|JJ|NN.*>+} # Chunk sequences of DT, JJ, NN
PP: {<IN><NP>} # Chunk prepositions followed by NP
VP: {<VB.*><NP|PP|CLAUSE>+$} # Chunk verbs and their arguments
CLAUSE: {<NP><VP>} # Chunk NP, VP
tree = pickle.load(open("output/tagged_sents.p", "rb"))
cp = nltk.RegexpParser(grammar)
patternList = [
"defined as",
"defined by",
"definition of",
"a definition",
"the definition",
"known as"
docs = []
def checkList(word, pos, list):
    """Return a highlighted tag for desired words, or the word itself.

    A simple function to check if a word is a desired word and change it (or
    rather its part of speech's) color. This function exists because I was
    trying to identify common patterns in definitional sentences; I ran out
    of time before I could finish.

    Args:
        word: the token to look up.
        pos: the tag paired with ``word`` (its part of speech, per the note
            above).
        list: iterable of desired words. The name shadows the builtin but is
            kept so existing keyword callers keep working.

    Returns:
        ``pos`` wrapped in bold + underline escape codes followed by the
        reset code when ``word`` equals one of the entries in ``list``;
        otherwise ``word`` unchanged.
    """
    if any(pattern == word for pattern in list):
        return color.BOLD + color.UNDERLINE + pos + color.END
    return word
treed_sents = []
# I'm not sure what this is all doing at this point. The goal of this code is
# to help identify part-of-speech patterns, but I got sidetracked by
# word-lattices before I could finish it. Probably just needed to get a giant
# test set and try to use statistics to determine what traits (or clusters of
# traits, a la WCL) are common to definitional sentences.
# print sent.leaves()
# print sent.subtrees
for sent in tree:
# print sent
for subtree in sent.subtrees():
# for minitree in subtree.subtrees():
# print " ".join([b for (a,b) in subtree.leaves()])
# print subtree
# Record one space-joined string per subtree, passing each leaf pair through
# checkList together with the cue-phrase list.
docs.append(" ".join([checkList(a, b, patternList)
for (a, b) in subtree.leaves()]))
# Run the chunk parser over the subtree. The result is not used by any
# visible line.
test_tree = cp.parse(subtree)
# print test_tree
# for tiny_tree in test_tree.subtrees():
# if tiny_tree.node == 'NP':
# print tiny_tree
# NOTE(review): this loop has only commented-out code as its body; confirm
# whether an active statement (e.g. a print) was lost.
for line in docs:
# tree = cp.parse(line)
# for subtree in tree.subtrees():
# if subtree.node == 'CHUNK':
# print subtree
# NOTE(review): nothing visible ever adds to treed_sents, so as shown this
# loop prints nothing.
for struct in treed_sents:
print struct.pprint()
