Last active
August 29, 2015 14:00
-
-
Save gerbal/05bff2dcfcfdf7d373de to your computer and use it in GitHub Desktop.
Simple Definition Extraction attempt
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import nltk.data | |
import nltk | |
from nltk import tokenize | |
from nltk import tag | |
from nltk import chunk | |
try: | |
import xml.etree.cElementTree as ET | |
except ImportError: | |
import xml.etree.ElementTree as ET | |
import re | |
import cPickle as pickle | |
# An input file from the Stack Exchange Data Dumpt should go here. Posts.xml is most interesting. | |
inputFile = open("input/Posts.xml", 'rb') | |
tree = ET.parse(inputFile) | |
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle') | |
# A RegEx to find things that look like URLS and remove them | |
re_URL = re.compile(ur'(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:\'".,<>?\xab\xbb\u201c\u201d\u2018\u2019]))') | |
# Colors so we can get better user input when we ask the user to select | |
# definitional sentence candidates | |
class color:
    '''ANSI terminal escape sequences, used to visually highlight candidate
    cue phrases when prompting the user; END resets all attributes.'''
    PURPLE = '\033[95m'
    CYAN = '\033[96m'
    DARKCYAN = '\033[36m'
    BLUE = '\033[94m'
    GREEN = '\033[92m'
    YELLOW = '\033[93m'
    RED = '\033[91m'
    BOLD = '\033[1m'
    UNDERLINE = '\033[4m'
    END = '\033[0m'  # reset all attributes
# Not the best method of definition sentence identification, but it's very simple to implement
# Cue phrases: a sentence containing any of these is treated as a
# *candidate* definitional sentence (substring match, lowercased input).
patternList = [
    "defined as",
    "defined by",
    "define",
    "defines",
    "definition of",
    "a definition",
    "the definition",
    "comprise",
    "comprises",
    "denote",
    "denotes",
    "designate",
    "designates",
    "called",
    "known as"
]
# Massively reduces false positives, improving precision, but hurting recall
# Is it worth it? I don't know
# Substrings that contain a cue phrase but almost never mark a real
# definition; any sentence containing one of these is skipped outright.
excludeList = [
    "so-called",
    "undefined",
    "called with",
    "predefined"
]
# Extract all questions, answers, and comments from 'Posts.xml' into a dict
# keyed by question Id: rows without a ParentId are questions and start a new
# entry; rows with a ParentId are answers/comments appended to their question.
# Bodies are stripped of HTML and lowercased. Note this relies on questions
# appearing before their answers in the dump (true for the SE data dumps).
temporaryDictionary = {}
for row in tree.findall('row'):
    if not row.get("ParentId"):
        temporaryDictionary[
            row.get("Id")] = [nltk.clean_html(row.get("Body")).lower()]
    elif row.get("ParentId") in temporaryDictionary:
        # Membership test on the dict itself is O(1); the original tested
        # against .keys(), which (in Python 2) builds and scans a list per
        # row, making the whole loop quadratic.
        temporaryDictionary[
            row.get("ParentId")
        ].append(nltk.clean_html(row.get("Body")).lower())
# And now we're going to undo all our work with the dictionary and chunk all
# of that text into a big list of sentences. Each question's bodies are
# URL-stripped, joined into one string, then split with the Punkt tokenizer.
sentences = []
for postId in temporaryDictionary:
    # str.join builds the combined text in one pass; the original grew a
    # string with repeated `+` concatenation, which is quadratic.
    combined = " ".join(
        re_URL.sub("", body) for body in temporaryDictionary[postId])
    sentences.extend(tokenizer.tokenize(combined))
def highlight_words(sentence, words):
    '''
    Return `sentence` with every occurrence of each string in `words`
    wrapped in bold+underline ANSI escapes (see the color class above).

    The pattern is passed through re.escape so cue phrases containing
    regex metacharacters (., ?, etc.) are matched literally; the original
    interpolated the raw string into the regex.
    '''
    newsentence = sentence
    for pattern in words:
        # Check against the original sentence, as before: ANSI codes already
        # inserted into newsentence must not affect which patterns we try.
        if pattern in sentence:
            newsentence = re.sub(
                re.escape(pattern),
                color.UNDERLINE + color.BOLD + pattern + color.END,
                newsentence)
    return newsentence
# Go through the list of sentences, and if they have any of the patterns in them | |
# ask the user if they are actual definitional sentences | |
# It occurs to me now that I don't have a good standard for what consitutes definitional | |
definitions = [] | |
for sentence in sentences: | |
# print sentence | |
if any(pattern in sentence for pattern in patternList) and not any(pattern in sentence for pattern in excludeList): | |
print "\n-\n-\n-\n-\n-\n" | |
print highlight_words(sentence, patternList) | |
is_definition = raw_input( | |
"\n------\nDoes this look like a defintion? (y/n):\n------\n") | |
if is_definition == "y": | |
definitions.append(sentence) | |
# And write out that list of sentences to a file, using one of the wierdest named python libraries | |
pickle.dump(definitions, open("output/training_defintions.p", "wb")) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment