Script to parse visualMOA.org SGML files using NLTK
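Usage sketch (the script name parse_moa.py and the sample path are assumptions, not part of the gist): install nltk, lxml and pymongo, fetch NLTK's punkt model and the Brown corpus, start MongoDB on localhost:27017, then point the script at a directory of MOA SGML files, e.g. python parse_moa.py --dir ./moa_sgml/ — parsed sentences land in the moa.journals collection.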
#!/usr/bin/env python
import optparse
import sys
from lxml import etree
import time
import nltk
from pymongo import Connection
import os
import hashlib

tokenizer_sentences = None
tokenizer_words = None
tagger = None
db = None
connection = None
posts = None
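
#the globals above are initialized once in main() and shared with
#process_file(), which reads the tokenizers, the tagger and the Mongo handles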

def main():
    global tokenizer_sentences
    global tokenizer_words
    global tagger
    global db
    global connection
    global posts

    p = optparse.OptionParser()
    p.add_option('--dir', '-d', default="")
    options, arguments = p.parse_args()

    if options.dir == "":
        print 'No directory given'
        sys.exit()
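
    #load the Punkt sentence splitter, build a regexp tokenizer that returns
    #words and runs of punctuation as separate tokens, and train a unigram
    #POS tagger on the tagged Brown corpus (the training pass takes a moment)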
    tokenizer_sentences = nltk.data.load('tokenizers/punkt/english.pickle')
    tokenizer_words = nltk.tokenize.RegexpTokenizer(r'\w+|[^\w\s]+')
    tagger = nltk.UnigramTagger(nltk.corpus.brown.tagged_sents())

    connection = Connection('localhost', 27017)
    db = connection.moa
    posts = db.journals

    dirList = os.listdir(options.dir)
    for fname in dirList:
        process_file(os.path.join(options.dir, fname))

def process_file(file_name):
    global tokenizer_sentences
    global tokenizer_words
    global tagger
    global db
    global connection
    global posts

    print "\n\n" + file_name + ":"

    try:
        data_string = open(file_name, 'r').read()
    except IOError:
        print 'Could not find that file'
        sys.exit()

    data_tree = etree.fromstring(data_string)
    found_count = 0
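
    #walk every TEI.2 element in the file; its DATE children carry the issue
    #date and its DIV1 children are the individual articles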
    for tei in data_tree.findall('.//TEI.2'):
        serial_date = ''
        for date_serial in tei.findall('.//DATE'):
            #is this the scan date or what?
            try:
                int(date_serial.text)
            except ValueError:
                #it is the journal date
                print "\t" + date_serial.text
                #fix some little mistakes from the data inputters
                date_serial.text = date_serial.text.replace(', ', ' ')
                date_serial.text = date_serial.text.replace(',', ' ')
                date_serial.text = date_serial.text.replace('. ', ' ')
                date_serial.text = date_serial.text.replace('.', ' ')
                if date_serial.text.find('-') != -1:
                    date_serial.text = date_serial.text[0:date_serial.text.find('-')-1] + ' ' + date_serial.text[len(date_serial.text)-4:]
                date_serial.text = date_serial.text.replace('Octoberl', 'October')
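                #try the likely date formats from most to least common and, if
                #none fit, fall back to asking the operator to retype the date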
                try:
                    serial_date = time.strptime(date_serial.text, "%B %Y")
                except ValueError:
                    try:
                        date_serial.text = date_serial.text.replace('Sept ', 'Sep ')
                        serial_date = time.strptime(date_serial.text, "%b %Y")
                    except ValueError:
                        #it might be in a month day year format
                        try:
                            serial_date = time.strptime(date_serial.text, "%b %d %Y")
                        except ValueError:
                            #ugh....
                            try:
                                serial_date = time.strptime(date_serial.text, "%B %d %Y")
                            except ValueError:
                                #give up, ask for help
                                try:
                                    print date_serial.text + ' could not be parsed as a date!'
                                    date_serial.text = raw_input("Enter date in format (Mmm yyyy): ")
                                    serial_date = time.strptime(date_serial.text, "%b %Y")
                                except ValueError:
                                    print date_serial.text + ' could not be parsed as a date!'
                                    sys.exit()
        if serial_date == '':
            #this might be the 'notes on digital production' section; if so that is okay, there are no PBs in it
            if date_serial.text != '1999':
                print "Error: Could not locate a date for this section"
                sys.exit()
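
        #pull the article title and type plus the DIV1-level TYPE and DECLS
        #metadata, defaulting to 'Unknown' when an attribute or title is missing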
        for div1 in tei.findall('.//DIV1'):
            article_title = ''
            article_type = ''
            for title in div1.findall('.//TITLE'):
                if title.attrib.get('TYPE') is None:
                    article_type = 'Unknown'
                else:
                    article_type = title.attrib['TYPE']
                if title.text is None:
                    article_title = 'Unknown Title'
                else:
                    article_title = title.text
            print "\t" + article_title + ' :: ' + article_type,
            if div1.attrib.get('TYPE') is None:
                article_meta_type = 'Unknown Type'
            else:
                article_meta_type = div1.attrib['TYPE']
            if div1.attrib.get('DECLS') is None:
                article_meta_decls = 'Unknown Decls'
            else:
                article_meta_decls = div1.attrib['DECLS']
            text_article = ''
            text_segment = ''
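            #PB elements mark page breaks; the text on each one appears to be
            #that page's OCR'd article text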
            for text in div1.findall('.//PB'):
                print ".",
                found_count = found_count + 1
                article_meta_page = text.attrib['REF']
                article_meta_seq = text.attrib['SEQ']
                text_segment = str(text.text)
                text_segment = text_segment.replace('~\n', '')
                text_segment = text_segment.replace('-\n', '')
                text_segment = text_segment.replace('\n', ' ')
                text_segment = text_segment.replace('\t', '')
                text_segment = text_segment.strip()
                text_sentences = tokenizer_sentences.tokenize(text_segment)
                for sentence in text_sentences:
                    sentence = sentence.replace('\n', '')
                    sentence = sentence.replace('\t', '')
                    sentence = sentence.replace('\r', '')
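                    #very short sentences are presumably OCR noise or running
                    #heads, so only ones longer than 30 characters get tagged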
                    #this shit is seriously dirty :(
                    if len(sentence) > 30:
                        #print sentence + "\n--"
                        tokenized = tokenizer_words.tokenize(sentence)
                        tagged = tagger.tag(tokenized)
                        #print tagged
                        nouns = []
                        verbs = []
                        propers = []
                        for index, item in enumerate(tagged):
                            #print index, item
                            #print item[0]
                            noun_proper = ''
                            #first find the proper nouns
                            #proper-noun tags: Brown NP/NP-TL/NPS-TL plus
                            #Penn-style NNP; the last tag in this check is
                            #truncated in the source, NNPS is an assumption
                            if item[1] in ('NP', 'NP-TL', 'NPS-TL', 'NNP', 'NNPS'):
                                #now see if there is another proper noun afterwards, meaning a possible name
                                noun_proper = item[0]
                                #see if there is even the possibility
                                if index + 1 <= len(tagged) - 1:
                                    if tagged[index+1][0].istitle():
                                        #print "Two word proper noun!" + tagged[index][0] + ' ' + tagged[index+1][0]
                                        noun_proper = tagged[index][0] + ' ' + tagged[index+1][0]
                                        if index + 2 <= len(tagged) - 1:
                                            if tagged[index+2][0].istitle():
                                                #print "Three word proper noun!"
                                                noun_proper = tagged[index][0] + ' ' + tagged[index+1][0] + ' ' + tagged[index+2][0]
                                #print noun_proper
                                #sometimes part of the proper noun is a word that is not tagged as a noun, such as "new york"
                                if noun_proper.find(' ') == -1:
                                    if index - 1 >= 0:
                                        if tagged[index-1][0].istitle():
                                            noun_proper = tagged[index-1][0] + ' ' + tagged[index][0]
                                if noun_proper != '' and noun_proper not in propers:
                                    if len(noun_proper) > 2:
                                        propers.append(noun_proper.lower())
                            #now find the rest of the nouns
                            if item[1] in ('NN', 'NNS', 'NN-TL', 'NNS-TL') or item[1] is None:
                                #make sure it is not in the proper noun if we found it above
                                if noun_proper.find(item[0]) == -1:
                                    #see if it has an important adjective before it (that is not the first word of the sentence)
                                    if index - 1 > 0:
                                        if tagged[index-1][0].istitle():
                                            nouns.append(tagged[index-1][0].lower() + ' ' + item[0].lower())
                                        else:
                                            nouns.append(item[0].lower())
                            #verbs
                            if item[1] is not None and item[1][0:2] == 'VB':
                                verbs.append(item[0].lower())
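                        #one MongoDB document per sentence: the sentence itself,
                        #the extracted nouns/verbs/proper nouns, and the article
                        #metadata; article_id is an md5 of DECLS + title so all
                        #of an article's sentences can be grouped later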
                        post = {"date": float(str(serial_date.tm_year) + '.' + str(serial_date.tm_mon)),
                                "year": int(serial_date.tm_year),
                                "article_title": article_title,
                                "article_type": article_type,
                                "text_meta_type": article_meta_type,
                                "text_meta_decls": article_meta_decls,
                                "text_meta_page": article_meta_page,
                                "article_id": hashlib.md5(article_meta_decls + article_title).hexdigest(),
                                "sentence": sentence,
                                "nouns": nouns,
                                "verbs": verbs,
                                "propers": propers}
                        posts.insert(post)
    print "~"

if __name__ == '__main__':
    main()