Script to parse visualMOA.org SGML files using NLTK
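Usage sketch (the script name parse_moa.py and the sample path are assumptions, not part of the gist): install nltk, lxml and pymongo, fetch NLTK's punkt model and the Brown corpus, start MongoDB on localhost:27017, then point the script at a directory of MOA SGML files, e.g. python parse_moa.py --dir ./moa_sgml/ — parsed sentences land in the moa.journals collection.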
#!/usr/bin/env python
import optparse
import sys
from lxml import etree
import time
import nltk
from pymongo import Connection
import os
import hashlib

tokenizer_sentences = None
tokenizer_words = None
tagger = None
db = None
connection = None
posts = None
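
#the globals above are initialized once in main() and shared with
#process_file(), which reads the tokenizers, the tagger and the Mongo handles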

def main():
    global tokenizer_sentences
    global tokenizer_words
    global tagger
    global db
    global connection
    global posts

    p = optparse.OptionParser()
    p.add_option('--dir', '-d', default="")
    options, arguments = p.parse_args()

    if options.dir == "":
        print 'No directory given'
        sys.exit()
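
    #load the Punkt sentence splitter, build a regexp tokenizer that returns
    #words and runs of punctuation as separate tokens, and train a unigram
    #POS tagger on the tagged Brown corpus (the training pass takes a moment)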
    tokenizer_sentences = nltk.data.load('tokenizers/punkt/english.pickle')
    tokenizer_words = nltk.tokenize.RegexpTokenizer(r'\w+|[^\w\s]+')
    tagger = nltk.UnigramTagger(nltk.corpus.brown.tagged_sents())

    connection = Connection('localhost', 27017)
    db = connection.moa
    posts = db.journals

    dirList = os.listdir(options.dir)
    for fname in dirList:
        process_file(os.path.join(options.dir, fname))

def process_file(file_name):
    global tokenizer_sentences
    global tokenizer_words
    global tagger
    global db
    global connection
    global posts

    print "\n\n" + file_name + ":"

    try:
        data_string = open(file_name, 'r').read()
    except IOError:
        print 'Could not find that file'
        sys.exit()

    data_tree = etree.fromstring(data_string)
    found_count = 0
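
    #walk every TEI.2 element in the file; its DATE children carry the issue
    #date and its DIV1 children are the individual articles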
    for tei in data_tree.findall('.//TEI.2'):
        serial_date = ''
        for date_serial in tei.findall('.//DATE'):
            #is this the scan date or what?
            try:
                int(date_serial.text)
            except ValueError:
                #it is the journal date
                print "\t" + date_serial.text
                #fix some little mistakes from the data inputters
                date_serial.text = date_serial.text.replace(', ', ' ')
                date_serial.text = date_serial.text.replace(',', ' ')
                date_serial.text = date_serial.text.replace('. ', ' ')
                date_serial.text = date_serial.text.replace('.', ' ')
                if date_serial.text.find('-') != -1:
                    date_serial.text = date_serial.text[0:date_serial.text.find('-')-1] + ' ' + date_serial.text[len(date_serial.text)-4:]
                date_serial.text = date_serial.text.replace('Octoberl', 'October')
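                #try the likely date formats from most to least common and, if
                #none fit, fall back to asking the operator to retype the date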
                try:
                    serial_date = time.strptime(date_serial.text, "%B %Y")
                except ValueError:
                    try:
                        date_serial.text = date_serial.text.replace('Sept ', 'Sep ')
                        serial_date = time.strptime(date_serial.text, "%b %Y")
                    except ValueError:
                        #it might be in a month day year format
                        try:
                            serial_date = time.strptime(date_serial.text, "%b %d %Y")
                        except ValueError:
                            #ugh....
                            try:
                                serial_date = time.strptime(date_serial.text, "%B %d %Y")
                            except ValueError:
                                #give up, ask for help
                                try:
                                    print date_serial.text + ' could not be parsed as a date!'
                                    date_serial.text = raw_input("Enter date in format (Mmm yyyy): ")
                                    serial_date = time.strptime(date_serial.text, "%b %Y")
                                except ValueError:
                                    print date_serial.text + ' could not be parsed as a date!'
                                    sys.exit()
        if serial_date == '':
            #this might be the 'notes on digital production' section; if so that is okay, there are no PBs in it
            if date_serial.text != '1999':
                print "Error: Could not locate a date for this section"
                sys.exit()
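
        #pull the article title and type plus the DIV1-level TYPE and DECLS
        #metadata, defaulting to 'Unknown' when an attribute or title is missing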
        for div1 in tei.findall('.//DIV1'):
            article_title = ''
            article_type = ''
            for title in div1.findall('.//TITLE'):
                if title.attrib.get('TYPE') is None:
                    article_type = 'Unknown'
                else:
                    article_type = title.attrib['TYPE']
                if title.text is None:
                    article_title = 'Unknown Title'
                else:
                    article_title = title.text
            print "\t" + article_title + ' :: ' + article_type,
            if div1.attrib.get('TYPE') is None:
                article_meta_type = 'Unknown Type'
            else:
                article_meta_type = div1.attrib['TYPE']
            if div1.attrib.get('DECLS') is None:
                article_meta_decls = 'Unknown Decls'
            else:
                article_meta_decls = div1.attrib['DECLS']
            text_article = ''
            text_segment = ''
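            #PB elements mark page breaks; the text on each one appears to be
            #that page's OCR'd article text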
            for text in div1.findall('.//PB'):
                print ".",
                found_count = found_count + 1
                article_meta_page = text.attrib['REF']
                article_meta_seq = text.attrib['SEQ']
                text_segment = str(text.text)
                text_segment = text_segment.replace('~\n', '')
                text_segment = text_segment.replace('-\n', '')
                text_segment = text_segment.replace('\n', ' ')
                text_segment = text_segment.replace('\t', '')
                text_segment = text_segment.strip()
                text_sentences = tokenizer_sentences.tokenize(text_segment)
                for sentence in text_sentences:
                    sentence = sentence.replace('\n', '')
                    sentence = sentence.replace('\t', '')
                    sentence = sentence.replace('\r', '')
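                    #very short sentences are presumably OCR noise or running
                    #heads, so only ones longer than 30 characters get tagged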
                    #this shit is seriously dirty :(
                    if len(sentence) > 30:
                        #print sentence + "\n--"
                        tokenized = tokenizer_words.tokenize(sentence)
                        tagged = tagger.tag(tokenized)
                        #print tagged
                        nouns = []
                        verbs = []
                        propers = []
                        for index, item in enumerate(tagged):
                            #print index, item
                            #print item[0]
                            noun_proper = ''
                            #first find the proper nouns
                            #proper-noun tags: Brown NP/NP-TL/NPS-TL plus
                            #Penn-style NNP; the last tag in this check is
                            #truncated in the source, NNPS is an assumption
                            if item[1] in ('NP', 'NP-TL', 'NPS-TL', 'NNP', 'NNPS'):
                                #now see if there is another proper noun afterwards, meaning a possible name
                                noun_proper = item[0]
                                #see if there is even the possibility
                                if index + 1 <= len(tagged) - 1:
                                    if tagged[index+1][0].istitle():
                                        #print "Two word proper noun!" + tagged[index][0] + ' ' + tagged[index+1][0]
                                        noun_proper = tagged[index][0] + ' ' + tagged[index+1][0]
                                        if index + 2 <= len(tagged) - 1:
                                            if tagged[index+2][0].istitle():
                                                #print "Three word proper noun!"
                                                noun_proper = tagged[index][0] + ' ' + tagged[index+1][0] + ' ' + tagged[index+2][0]
                                #print noun_proper
                                #sometimes part of the proper noun is a word that is not tagged as a noun, such as "new york"
                                if noun_proper.find(' ') == -1:
                                    if index - 1 >= 0:
                                        if tagged[index-1][0].istitle():
                                            noun_proper = tagged[index-1][0] + ' ' + tagged[index][0]
                                if noun_proper != '' and noun_proper not in propers:
                                    if len(noun_proper) > 2:
                                        propers.append(noun_proper.lower())
                            #now find the rest of the nouns
                            if item[1] in ('NN', 'NNS', 'NN-TL', 'NNS-TL') or item[1] is None:
                                #make sure it is not in the proper noun if we found it above
                                if noun_proper.find(item[0]) == -1:
                                    #see if it has an important adjective before it (that is not the first word of the sentence)
                                    if index - 1 > 0:
                                        if tagged[index-1][0].istitle():
                                            nouns.append(tagged[index-1][0].lower() + ' ' + item[0].lower())
                                        else:
                                            nouns.append(item[0].lower())
                            #verbs
                            if item[1] is not None and item[1][0:2] == 'VB':
                                verbs.append(item[0].lower())
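                        #one MongoDB document per sentence: the sentence itself,
                        #the extracted nouns/verbs/proper nouns, and the article
                        #metadata; article_id is an md5 of DECLS + title so all
                        #of an article's sentences can be grouped later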
                        post = {"date": float(str(serial_date.tm_year) + '.' + str(serial_date.tm_mon)),
                                "year": int(serial_date.tm_year),
                                "article_title": article_title,
                                "article_type": article_type,
                                "text_meta_type": article_meta_type,
                                "text_meta_decls": article_meta_decls,
                                "text_meta_page": article_meta_page,
                                "article_id": hashlib.md5(article_meta_decls + article_title).hexdigest(),
                                "sentence": sentence,
                                "nouns": nouns,
                                "verbs": verbs,
                                "propers": propers}
                        posts.insert(post)
    print "~"

if __name__ == '__main__':
    main()