Last active
August 29, 2015 14:11
-
-
Save fayeip/f18f7431fc48d542efe7 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"worksheets": [ | |
{ | |
"cells": [ | |
{ | |
"metadata": {}, | |
"cell_type": "code", | |
"input": "import nltk\nfrom nltk.corpus import PlaintextCorpusReader\nimport re\nfrom itertools import chain\nfrom nltk import tokenize\nfrom nltk.corpus import stopwords\nimport nltk.data\nimport json\nimport pdb\nfrom collections import defaultdict", | |
"prompt_number": 54, | |
"outputs": [], | |
"language": "python", | |
"trusted": true, | |
"collapsed": false | |
}, | |
{ | |
"metadata": {}, | |
"cell_type": "code", | |
"input": "#Importing corpus\ncorpus_root = 'data'\nwordlists = PlaintextCorpusReader(corpus_root, '.*\\\\.txt')\nsent_detector = nltk.data.load('tokenizers/punkt/english.pickle')", | |
"prompt_number": 2, | |
"outputs": [], | |
"language": "python", | |
"trusted": true, | |
"collapsed": false | |
}, | |
{ | |
"metadata": {}, | |
"cell_type": "code", | |
"input": "#Clean up Process - create date regex parameters \ndate_pattern = '((J(anuary|u(ne|ly))|February|Ma(rch|y)|A(pril|ugust)|(((Sept|Nov|Dec)em)|Octo)ber).*([0-9]))'\nmp = '(J(anuary|u(ne|ly))|February|Ma(rch|y)|A(pril|ugust)|(((Sept|Nov|Dec)em)|Octo)ber)'\nyp = '[0-9]{4}'", | |
"outputs": [], | |
"language": "python", | |
"trusted": true, | |
"collapsed": false | |
}, | |
{ | |
"metadata": {}, | |
"cell_type": "code", | |
"input": "# Testing the patterns \ntest = \"This is the month of November 9, 2014\"\ndate = re.search(date_pattern,test)\nm = re.search(mp,date.group(0))\nmonth = m.group(0)\ny = re.search(yp,date.group(0))\nyear = y.group(0)", | |
"outputs": [], | |
"language": "python", | |
"trusted": true, | |
"collapsed": false | |
}, | |
{ | |
"metadata": {}, | |
"cell_type": "code", | |
"input": "# start processing\n#Set the dictionaries \ncorpus_dict = {}\n\n#Putting it all together\nfor fileid in wordlists.fileids():\n    #Part 1: split of xx of DOCUMENTS \n    doc_list = re.split('((?m)^\\\\s+[0-9]+\\\\s*of\\\\s*[0-9]+\\\\s+DOCUMENTS)', wordlists.raw(fileid))\n    doc_list.pop(0) #got rid of garbage first empty line\n    master_list = list() # put all documents by id, header, footer\n    #print len(doc_list) # keep for testing -- how many documents within a single file \n    \n    #Part 2: split into id, head and footer and create a triple tuple \n    for idx in range(0, len(doc_list), 2):\n        # add a new tuple of id, header, footer\n        # split condition in order of importance\n        split_conds = ['words\r\n\r\n', 'Edition\r\n\r\n', 'Society Desk\r\n\r\n','Society Desk\r\n\r\n\r\n','DATELINE: Camden, Me.,\r\n\r\n\r\n']\n        doc_split = []\n        for cond in split_conds:\n            doc_split = re.split(cond,doc_list[idx+1], 1)\n            if len(doc_split) == 2:\n                break\n        #Part 2 contd: Error check to see if any of the splits didn't go through \n        if len(doc_split) < 2:\n            doc_parts = (doc_list[idx], doc_split)\n            print \"too few traces\"\n            pdb.set_trace()\n        elif len(doc_split) > 2:\n            print \"too many splits\"\n        else:\n            doc_parts = (doc_list[idx], doc_split[0], doc_split[1])\n#             print doc_split[0]\n#             print '<><><><><><><><><>'\n#             print doc_split[1]\n#             print \"****************************************\"\n        master_list.append(doc_parts) #Create that tuple triple \n    \n    year_counter = []\n    #Part 3: Read the header and extract date \n    for doc in master_list:\n        #Part 3 a: Header cleaning steps \n        clean_header = re.sub(r\"\\b(The New York Times|(DATELINE:.*)|(BYLINE.*)|(.*Correction Appended.*)|(SECTION:.*)|(LENGTH:.*)|(LOAD-DATE:.*)|(http:.*)|(LANGUAGE:.*)|(GRAPHIC:.*)|(Copyright.*)|(Late Edition - Final.*))\\b\", \"\", doc[1])\n        clean_header = clean_header.replace(\"\\r\",\"\").strip()\n        clean_header = [x for x in clean_header.split('\\n') if any(x.isalnum() for x in x)]\n        header_final = ' '.join(clean_header)\n\n        #Part 3b: Extracting the date\n        date = re.search(date_pattern,header_final)\n        m = re.search(mp,date.group(0))\n        month = m.group(0)\n        y = re.search(yp,date.group(0))\n        year = y.group(0)\n        year_counter.append(year) \n\n        if \"Events\" not in header_final:\n            body = doc[2]\n            clean_sent = re.sub(r\"\\b(The New York Times|(DATELINE:.*)|(SECTION:.*)|(LENGTH:.*)|(LOAD-DATE:.*)|(http:.*)|(URL:.*)|(LANGUAGE:.*)|(PUBLICATION.*)|(GRAPHIC:.*)|(Copyright.*))\\b\", \"\", body)\n            body = re.sub('\r\n(?!\r\n)', ' ',clean_sent)\n\n            #Part 4 adding to the dictionary\n            corpus_dict.setdefault(year,{}).setdefault(month, []).append((doc[0],header_final,body)) \n    \n#Part 5: Write to a JSON file \nwith open('data/dict2014.json', 'wb') as fp:\n    json.dump(corpus_dict, fp)\n    ", | |
"outputs": [], | |
"language": "python", | |
"trusted": true, | |
"collapsed": false | |
}, | |
{ | |
"metadata": {}, | |
"cell_type": "code", | |
"input": "# print corpus_dict['1984']['March']", | |
"prompt_number": 55, | |
"outputs": [], | |
"language": "python", | |
"trusted": true, | |
"collapsed": false | |
}, | |
{ | |
"metadata": {}, | |
"cell_type": "code", | |
"input": "#Sent to AWS to tag", | |
"outputs": [], | |
"language": "python", | |
"trusted": true, | |
"collapsed": false | |
}, | |
{ | |
"metadata": {}, | |
"cell_type": "code", | |
"input": "#Download Stanford NER taggers\nfrom nltk.tag.stanford import POSTagger\nfrom nltk.tag.stanford import NERTagger\npost = POSTagger('lib/stanford-postagger-2014-10-26/models/english-bidirectional-distsim.tagger',\n 'lib/stanford-postagger-2014-10-26/stanford-postagger.jar', 'utf-8')\n\nnert = NERTagger('lib/stanford-ner-2014-10-26/classifiers/english.all.3class.distsim.crf.ser.gz',\n 'lib/stanford-ner-2014-10-26/stanford-ner.jar', 'utf-8')", | |
"outputs": [], | |
"language": "python", | |
"trusted": true, | |
"collapsed": false | |
}, | |
{ | |
"metadata": {}, | |
"cell_type": "code", | |
"input": "#Load the entity tagged file as a tuple of tuples \nfrom ast import literal_eval\n\ntagged_1984 = []\n\nwith open('1984_tagged.txt', 'r') as f:\n for line in f:\n line.split(',')\n tagged_1984.append(literal_eval(line.strip()))\n", | |
"outputs": [], | |
"language": "python", | |
"trusted": true, | |
"collapsed": false | |
}, | |
{ | |
"metadata": {}, | |
"cell_type": "code", | |
"input": "#Customizing the tagger\n#assigning a custom tag in the word,tag \n\nfrom ast import literal_eval\n\ndef alter_source(sourcefile):\n    f_before = open(sourcefile,'r')\n    f_before_str = f_before.read()\n    f_before.close()\n\n    \"\"\"Customizing the tags labeled 'O'\n    1) Widow , Widower, Widowed >> label: W for widow\n    2) Mr., Mrs., Adm., Sgt., Dr. >> label: PERSON\n    3) Rev., Rabbi, priest >> label: R for religious \n    4) bride >> label: B for bride \n    5) bridegroom, groom >> label: G for groom\n\n    \"\"\"\n\n    f_after_str = ''\n    # Adding the custom tag set 1 - widow\n    f_after_str_1 = re.sub(r\"\\(\\'widow\\', \\'O\\'\\)\", \"('widow', 'W')\",f_before_str)\n    f_after_str_2 = re.sub(r\"\\(\\'widower\\', \\'O\\'\\)\", \"('widower', 'W')\",f_after_str_1)\n    f_after_str_3 = re.sub(r\"\\(\\'widowed\\', \\'O\\'\\)\", \"('widowed', 'W')\",f_after_str_2)\n\n    #Adding the custom tag set 2 - person \n    f_after_str_4 = re.sub(r\"\\(\\'Mr.\\', \\'O\\'\\)\", \"('Mr.', 'PERSON')\",f_after_str_3)\n    f_after_str_5 = re.sub(r\"\\(\\'Mrs.\\', \\'O\\'\\)\", \"('Mrs.', 'PERSON')\",f_after_str_4)\n    f_after_str_6 = re.sub(r\"\\(\\'Adm.\\', \\'O\\'\\)\", \"('Adm.', 'PERSON')\",f_after_str_5)\n    f_after_str_7 = re.sub(r\"\\(\\'Sgt.\\', \\'O\\'\\)\", \"('Sgt.', 'PERSON')\",f_after_str_6)\n    f_after_str_8 = re.sub(r\"\\(\\'Dr.\\', \\'O\\'\\)\", \"('Dr.', 'PERSON')\",f_after_str_7)\n\n\n    #Adding the custom tag set 3 - religious head \n    f_after_str_9 = re.sub(r\"\\(\\'Rev.\\', \\'O\\'\\)\", \"('Rev.', 'R')\",f_after_str_8)\n    f_after_str_10 = re.sub(r\"\\(\\'\\bRabbi\\b\\', \\'O\\'\\)\", \"('Rabbi', 'R')\",f_after_str_9)\n    f_after_str_11 = re.sub(r\"\\(\\'\\bpriest\\b\\', \\'O\\'\\)\", \"('priest','R')\",f_after_str_10)\n\n    # Adding the custom tag set 4 - divorced\n    f_after_str_12 = re.sub(r\"\\(\\'\\bdivorce\\b\\', \\'O\\'\\)\", \"('divorce', 'D')\",f_after_str_11)\n    f_after_str_13 = re.sub(r\"\\(\\'\\bdivorced\\b\\', \\'O\\'\\)\", \"('divorced', 'D')\",f_after_str_12)\n\n    # Adding the custom tag set 4 - divorced\n    f_after_str_14 = re.sub(r\"\\(\\'divorce\\', \\'O\\'\\)\", \"sufia\", f_after_str_13)\n    f_after_str_15 = re.sub(r\"\\(\\'divorced\\', \\'O\\'\\)\", \"('divorced', 'D')\",f_after_str_14)\n\n    # Adding the custom tag set 5 - bride\n    f_after_str_16 = re.sub(r\"\\(\\'\\bbride\\b\\', \\'O\\'\\)\", \"('bride', 'B')\",f_after_str_15)\n\n    # Adding the custom tag set 6 - bridegroom\n    f_after_str_17 = re.sub(r\"\\(\\'\\bbridegroom\\b\\', \\'O\\'\\)\", \"('bridegroom', 'G')\",f_after_str_16)\n    f_after_str_final = re.sub(r\"\\(\\'\\bgroom\\b\\', \\'O\\'\\)\", \"('groom', 'G')\",f_after_str_17)\n    \n    return f_after_str_final\n\n\ndef apply_custom_tags (targetfile, custom_tags):\n    f = open(targetfile,'w')\n    f.write(custom_tags)\n    f.close()\n\n    custom_tag_list = []\n\n    with open(targetfile, 'r') as g:\n        for line in g:\n            line.split('\\n')\n            custom_tag_list.append(literal_eval(line.strip()))\n    return custom_tag_list", | |
"prompt_number": 3, | |
"outputs": [], | |
"language": "python", | |
"trusted": true, | |
"collapsed": false | |
}, | |
{ | |
"metadata": {}, | |
"cell_type": "code", | |
"input": "#Run custom tagger for all 5 years\n\ncustom_tagged_1984 = apply_custom_tags('data_tagged2/1984_tagged_custom.txt',alter_source('data_tagged/1984_tagged.txt'))\ncustom_tagged_1990 = apply_custom_tags('data_tagged2/1990_tagged_custom.txt',alter_source('data_tagged/1990_tagged.txt'))\ncustom_tagged_2000 = apply_custom_tags('data_tagged2/2000_tagged_custom.txt',alter_source('data_tagged/2000_tagged.txt'))\ncustom_tagged_2010 = apply_custom_tags('data_tagged2/2010_tagged_custom.txt',alter_source('data_tagged/2010_tagged.txt'))\ncustom_tagged_2014 = apply_custom_tags('data_tagged2/2014_tagged_custom.txt',alter_source('data_tagged/2014_tagged.txt'))", | |
"prompt_number": 4, | |
"outputs": [], | |
"language": "python", | |
"trusted": true, | |
"collapsed": false | |
}, | |
{ | |
"metadata": {}, | |
"cell_type": "code", | |
"input": "# print custom_tagged_1984 ", | |
"prompt_number": 56, | |
"outputs": [], | |
"language": "python", | |
"trusted": true, | |
"collapsed": false | |
}, | |
{ | |
"metadata": {}, | |
"cell_type": "code", | |
"input": "# Import the RegexpParser\nfrom nltk.chunk import RegexpParser\n\n# Define your custom tagged data. \n# entities\n\ndef chunker_rules(values):\n # Define custom grammar (modified to be a valid regex).\n grammar = r'''\n PERSON:\n {<PERSON><O><PERSON>+}\n {<PERSON>+}\n ORGANIZATION: \n {<ORGANIZATION>+}\n LOCATION: \n {<LOCATION>+} \n\n '''\n cp = nltk.RegexpParser(grammar) # Create an instance of your custom parser.\n return cp.parse(values) # Parse!\n\ndef entity_chunker(tagged_docs):\n chunks = []\n for doc in tagged_docs:\n tree = chunker_rules(doc)\n for subtree in tree.subtrees():\n if (subtree.node == 'CHUNK'):\n leaflist = [leaf[0] for leaf in subtree.leaves()]\n chunks.append(' '.join(leaflist))\n# if verb in leaflist:\n# chunks.append(' '.join(leaflist))\n return chunks\n ", | |
"prompt_number": 33, | |
"outputs": [], | |
"language": "python", | |
"trusted": true, | |
"collapsed": false | |
}, | |
{ | |
"metadata": {}, | |
"cell_type": "code", | |
"input": "# Creating a dictionary for each wedding announcement", | |
"outputs": [], | |
"language": "python", | |
"trusted": true, | |
"collapsed": false | |
}, | |
{ | |
"metadata": {}, | |
"cell_type": "code", | |
"input": "rel_dict = defaultdict(dict)", | |
"prompt_number": 34, | |
"outputs": [], | |
"language": "python", | |
"trusted": true, | |
"collapsed": false | |
}, | |
{ | |
"metadata": {}, | |
"cell_type": "code", | |
"input": "def extract_rels(rel_dict, dict_key, relKey, rel1, rel2, tree,regex): \n \"\"\"This function extracts the relationships\n Function Outputs:\n final output = {dict_key: [{relKey:[relationship extracted]}, {relKey:[relationship extracted]}]}\n example = {1:[{bride:'Mary Flyn',groom: 'John Mayer'}]}\n \n Function Inputs:\n 1) rel_dict = This is the default dict that will contain all the patterns in a dictionary per wedding announcement\n 2) dict_key = This is basically a counter per wedding announcment \n 3) relKey = This is the second key i.e. the relationship type you want to get values for \n 4) rel1 , rel2, regex = 'PERSON' [the word \"marries\"] 'PERSON'\n 5) tree = the parsed tree\n \"\"\" \n for rel in nltk.sem.extract_rels(rel1, rel2, tree, pattern = regex):\n# print nltk.sem.relextract.show_raw_rtuple(rel)\n# rels_str = nltk.sem.relextract.show_raw_rtuple(rel) \n dict_values = []\n dict_values.append(nltk.sem.relextract.show_raw_rtuple(rel))\n rel_dict[str(dict_key)][relKey] = dict_values\n \n# if relKey in rel_dict[str(dict_key)].keys():\n# rel_dict[str(dict_key)][relKey].append(dict_values)\n# else:\n# rel_dict[str(dict_key)][relKey] = dict_values", | |
"prompt_number": 35, | |
"outputs": [], | |
"language": "python", | |
"trusted": true, | |
"collapsed": false | |
}, | |
{ | |
"metadata": {}, | |
"cell_type": "code", | |
"input": "def make_rels_dict(tagged_data, rel_dict, relKey, rel1,rel2,regex):\n \"\"\"This function makes the dictionary for the relationships you want to extract -- \n read comments in function \"extract_rels\" for more context\"\"\"\n dict_key = 1\n \n for doc in tagged_data:\n tree = chunker_rules(doc)\n extract_rels (rel_dict, dict_key, relKey, rel1, rel2, tree,regex)\n dict_key +=1\n return rel_dict", | |
"prompt_number": 36, | |
"outputs": [], | |
"language": "python", | |
"trusted": true, | |
"collapsed": false | |
}, | |
{ | |
"metadata": {}, | |
"cell_type": "code", | |
"input": "DAUGHTER = re.compile(r'.*\\bdaughter\\b')\nmake_rels_dict(custom_tagged_1984,rel_dict,'daughter_relation','PERSON','PERSON',DAUGHTER)\nprint \"DAUGHTER relation done\"", | |
"prompt_number": 57, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"text": "DAUGHTER relation done\n", | |
"stream": "stdout" | |
} | |
], | |
"language": "python", | |
"trusted": true, | |
"collapsed": false | |
}, | |
{ | |
"metadata": {}, | |
"cell_type": "code", | |
"input": "# regex to lift out names and remove tags \n\ndef lift_names_remove_tags(relation, extracted_relation_str):\n \n if relation == 'DAUGHTER' or relation == 'SON':\n rels_pattern = r\"\\[PER: (.*)\\] (.*) \\[PER: (.*)\\]\"\n name_pattern = re.compile(r\"(\\w*\\.?)\\/PERSON\")\n names_search = re.search(rels_pattern, extracted_relation_str)\n bride_or_groom_name = name_pattern.findall(names_search.group(1))\n parents_name = name_pattern.findall(names_search.group(3))\n bride_or_groom_name_str = ''\n parents_name_str = ''\n for bg in bride_or_groom_name:\n bride_or_groom_name_str += bg + \" \"\n for p in parents_name:\n parents_name_str += p + \" \"\n \n return bride_or_groom_name_str.strip(), parents_name_str.strip()\n", | |
"prompt_number": 58, | |
"outputs": [], | |
"language": "python", | |
"trusted": true, | |
"collapsed": false | |
}, | |
{ | |
"metadata": {}, | |
"cell_type": "code", | |
"input": "for key in rel_dict.keys():\n for k in rel_dict[key].keys():\n# print rel_dict[key][k]\n for each in rel_dict[key][k]:\n if k == 'daughter_relation':\n bridename, brideparentsname = lift_names_remove_tags('DAUGHTER',each)\n rel_dict[key]['bride_name'] = bridename\n rel_dict[key]['bride_parents_names'] = brideparentsname ", | |
"prompt_number": 59, | |
"outputs": [], | |
"language": "python", | |
"trusted": true, | |
"collapsed": false | |
}, | |
{ | |
"metadata": {}, | |
"cell_type": "code", | |
"input": "# print rel_dict", | |
"prompt_number": 60, | |
"outputs": [], | |
"language": "python", | |
"trusted": true, | |
"collapsed": false | |
}, | |
{ | |
"metadata": {}, | |
"cell_type": "code", | |
"input": "SON = re.compile(r'.*\\bson\\b')\nmake_rels_dict(custom_tagged_1984,rel_dict,'son_relation','PERSON','PERSON',SON) \nprint \"Groom key has been added to master dict\"", | |
"prompt_number": 61, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"text": "Groom key has been added to master dict\n", | |
"stream": "stdout" | |
} | |
], | |
"language": "python", | |
"trusted": true, | |
"collapsed": false | |
}, | |
{ | |
"metadata": {}, | |
"cell_type": "code", | |
"input": "for key in rel_dict.keys():\n for k in rel_dict[key].keys():\n# print rel_dict[key][k]\n for each in rel_dict[key][k]:\n if k == 'son_relation':\n groomname, groomparentsname = lift_names_remove_tags('SON',each)\n rel_dict[key]['groom_name'] = groomname\n rel_dict[key]['groom_parents_names'] = groomparentsname ", | |
"prompt_number": 62, | |
"outputs": [], | |
"language": "python", | |
"trusted": true, | |
"collapsed": false | |
}, | |
{ | |
"metadata": {}, | |
"cell_type": "code", | |
"input": "print rel_dict['347']", | |
"prompt_number": 63, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"text": "{'daughter_relation': [\"[PER: 'Leicia/PERSON Sharon/PERSON Osborne/PERSON'] ',/O the/O daughter/O of/O' [PER: 'Mr./PERSON and/O Mrs./PERSON Philip/PERSON Barry/PERSON Osborne/PERSON']\"], 'bride_parents_names': 'Mr. Mrs. Philip Barry Osborne', 'groom_parents_names': 'Mr. Mrs. John A. Milano', 'son_relation': [\"[PER: 'Michael/PERSON Anthony/PERSON Milano/PERSON'] ',/O a/O son/O of/O' [PER: 'Mr./PERSON and/O Mrs./PERSON John/PERSON A./PERSON Milano/PERSON']\"], 'bride_name': 'Leicia Sharon Osborne', 'groom_name': 'Michael Anthony Milano'}\n", | |
"stream": "stdout" | |
} | |
], | |
"language": "python", | |
"trusted": true, | |
"collapsed": false | |
}, | |
{ | |
"metadata": {}, | |
"cell_type": "code", | |
"input": "# To test for empty patterns in the rel_dict\nempty = []\nfor i in range (1):\n if test_dict[str(i)].keys() == []:\n print i\n empty.append(i)\nprint \"final len\", len(empty)", | |
"outputs": [], | |
"language": "python", | |
"trusted": true, | |
"collapsed": false | |
}, | |
{ | |
"metadata": {}, | |
"cell_type": "code", | |
"input": "# Use functions below to see how well the individual patterns do and then add to the master dictionary", | |
"outputs": [], | |
"language": "python", | |
"trusted": true, | |
"collapsed": false | |
}, | |
{ | |
"metadata": {}, | |
"cell_type": "code", | |
"input": "# Tester Function to extract the relationships for individual patterns \ndef test_extract_rels (tagged_data, alist, rel1,rel2,regex):\n for doc in tagged_data:\n tree = chunker_rules(doc)\n for rel in nltk.sem.extract_rels(rel1, rel2, tree, pattern = regex):\n #print nltk.sem.relextract.show_raw_rtuple(rel)\n alist.append(nltk.sem.relextract.show_raw_rtuple(rel)) \n return alist ", | |
"outputs": [], | |
"language": "python", | |
"trusted": true, | |
"collapsed": false | |
}, | |
{ | |
"metadata": {}, | |
"cell_type": "code", | |
"input": "#Tester Function to append lists if needed \ndef append_rels(lists_to_append):\n master = []\n for i in lists_to_append:\n for rel in i: \n master.append(i)\n return master", | |
"outputs": [], | |
"language": "python", | |
"trusted": true, | |
"collapsed": false | |
}, | |
{ | |
"metadata": {}, | |
"cell_type": "code", | |
"input": "#Print function \ndef print_rels(rels):\n print \"length of list: \",len(rels)\n print \"=\" * 125 , \"\\n\"\n for i in rels:\n print i ", | |
"outputs": [], | |
"language": "python", | |
"trusted": true, | |
"collapsed": false | |
}, | |
{ | |
"metadata": {}, | |
"cell_type": "code", | |
"input": "#####Testing Individual Regex Patterns to add to master#########", | |
"outputs": [], | |
"language": "python", | |
"trusted": true, | |
"collapsed": false | |
}, | |
{ | |
"metadata": {}, | |
"cell_type": "code", | |
"input": "#Task1: Genders of who is being married\n#The bride is being extracted - related regex\nbride = []\nDAUGHTER = re.compile(r'.*\\bdaughter\\b')\nbride = extract_rels(tagged_1984, bride, 'PERSON','PERSON',DAUGHTER)\nprint_rels(bride)", | |
"outputs": [], | |
"language": "python", | |
"trusted": true, | |
"collapsed": false | |
}, | |
{ | |
"metadata": {}, | |
"cell_type": "code", | |
"input": "#OPEN QUESTION: Am I supposed to check which ones got picked and then append it to the bride list???\n#Task 1 - continued , extracting the bride \n\n#Create a list \nmarries = []\n\n#Define Regex \nMARRIES = re.compile(r'.*\\b[Mm]arrie[sd]\\b')\nWED = re.compile(r'.*\\b[Ww]eds?\\b')\nENAGEGEMENT = re.compile(r'.*\\b([Ee]ngaged|[Ee]ngagement|[Ee]gagment\\b)')\n\n#Run Relationship Extraction Function \nmarries1 = extract_rels(tagged_1984, marries, 'PERSON','ORGANIZATION',MARRIES)\nmarries2 = extract_rels(tagged_1984, marries, 'PERSON','PERSON',MARRIES)\nmarries3 = extract_rels(tagged_1984, marries, 'PERSON','PERSON',WED)\nmarries4 = extract_rels(tagged_1984, marries, 'PERSON','PERSON',ENAGEGEMENT)\n\n# Append Lists above \nlists = [marries1,marries2,marries3,marries4]\nmaster_marries = append_rels(lists)\nprint len(master_marries)\nprint '=' *100\n#Print Lists\nfor i in master_marries:\n for j in i: \n print j , \"\\n\"", | |
"outputs": [], | |
"language": "python", | |
"trusted": true, | |
"collapsed": false | |
}, | |
{ | |
"metadata": {}, | |
"cell_type": "code", | |
"input": "#Task1: Genders of who is being married\n#The groom is being extracted - related regex\ngroom = []\nSON = re.compile(r'.*\\bson\\b')\ngroom = extract_rels(tagged_1984, groom, 'PERSON','PERSON',SON)\nprint_rels(groom)", | |
"outputs": [], | |
"language": "python", | |
"trusted": true, | |
"collapsed": false | |
}, | |
{ | |
"metadata": {}, | |
"cell_type": "code", | |
"input": "#Task 2 - Hometowns of whom is being married \nhometown = []\nOF = re.compile(r'.*\\bof\\b')\nhometown = extract_rels(march1984_tagged, hometown,'PERSON','LOCATION',OF)\nprint_rels(hometown)\n# strip out the false positives ", | |
"outputs": [], | |
"language": "python", | |
"trusted": true, | |
"collapsed": false | |
}, | |
{ | |
"metadata": {}, | |
"cell_type": "code", | |
"input": "profession = []\nPROF = re.compile(r'.*\\b([Pp]rofessor|[Bb]anker|[Pp]rogrammer|[Aa]nalyst|[Aa]ssociate|[Hh]ead[master?]|[Cc]onsultant|[Cc]hairman|[Dd]octorate|[Aa]ccountant|[Ff]reelance|[Pp]artner|[Mm]anager|[Tt]eacher|[Ll]awyer|[Pp]resident|[Dd]ean|[Ee]ngineer|[Aa]ssistant|[Dd]irector|[Ee]xecutive)\\b')\nprofession = extract_rels(tagged_1984, profession,'PERSON','ORGANIZATION',PROF)\nprint_rels(profession)", | |
"outputs": [], | |
"language": "python", | |
"trusted": true, | |
"collapsed": false | |
}, | |
{ | |
"metadata": {}, | |
"cell_type": "code", | |
"input": "#larger funnel\nprofession_v2 = []\nIS = re.compile(r'.*\\bis\\b')\nprofession_v2 = extract_rels(tagged_1984, profession_v2,'PERSON','ORGANIZATION',IS)\nprint_rels(profession_v2)", | |
"outputs": [], | |
"language": "python", | |
"trusted": true, | |
"collapsed": false | |
}, | |
{ | |
"metadata": {}, | |
"cell_type": "code", | |
"input": "education = []\nEDU = re.compile(r'.*\\b([Dd]egree|[M.B.A.]|[M.S.]|[M.D.]|[Dd]esigner|[Mm]aster\\'s|[Gg]raduate[d]?|complet[ing|ed|e])\\b')\neducation = extract_rels(tagged_1984, education,'PERSON','ORGANIZATION',EDU)\nprint_rels(education)", | |
"outputs": [], | |
"language": "python", | |
"trusted": true, | |
"collapsed": false | |
}, | |
{ | |
"metadata": {}, | |
"cell_type": "code", | |
"input": "#larger funnel\neducation_v2 = []\nFROM = re.compile(r'.*\\bfrom\\b')\neducation_v2 = extract_rels(tagged_1984, education_v2,'PERSON','ORGANIZATION',FROM)\nprint_rels(education_v2)", | |
"outputs": [], | |
"language": "python", | |
"trusted": true, | |
"collapsed": false | |
}, | |
{ | |
"metadata": {}, | |
"cell_type": "code", | |
"input": "#Marries Extractors", | |
"outputs": [], | |
"language": "python", | |
"trusted": true, | |
"collapsed": false | |
}, | |
{ | |
"metadata": {}, | |
"cell_type": "code", | |
"input": "marries = []\nMARRIES = re.compile(r'.*\\b[Mm]arrie[sd]\\b')\n\nfor rel in nltk.sem.extract_rels('PERSON','ORGANIZATION', tree, pattern = MARRIES):\n marries.append(nltk.sem.relextract.show_raw_rtuple(rel))", | |
"outputs": [], | |
"language": "python", | |
"trusted": true, | |
"collapsed": false | |
}, | |
{ | |
"metadata": {}, | |
"cell_type": "code", | |
"input": "MARRIES = re.compile(r'.*\\b[Mm]arrie[sd]\\b')\n\nfor rel in nltk.sem.extract_rels('PERSON','PERSON', tree, pattern = MARRIES):\n# print nltk.sem.relextract.show_raw_rtuple(rel)\n marries.append(nltk.sem.relextract.show_raw_rtuple(rel))", | |
"outputs": [], | |
"language": "python", | |
"trusted": true, | |
"collapsed": false | |
}, | |
{ | |
"metadata": {}, | |
"cell_type": "code", | |
"input": "WED = re.compile(r'.*\\b[Ww]eds?\\b')\n\nfor rel in nltk.sem.extract_rels('PERSON','PERSON', tree, pattern = WED):\n# print nltk.sem.relextract.show_raw_rtuple(rel)\n marries.append(nltk.sem.relextract.show_raw_rtuple(rel))", | |
"outputs": [], | |
"language": "python", | |
"trusted": true, | |
"collapsed": false | |
}, | |
{ | |
"metadata": {}, | |
"cell_type": "code", | |
"input": "ENAGEGEMNT = re.compile(r'.*\\b([Ee]ngaged|[Ee]ngagement|[Ee]gagment\\b)')\n\nfor rel in nltk.sem.extract_rels('PERSON','PERSON', tree, pattern = MARRIES):\n marries.append(nltk.sem.relextract.show_raw_rtuple(rel))", | |
"outputs": [], | |
"language": "python", | |
"trusted": true, | |
"collapsed": false | |
}, | |
{ | |
"metadata": {}, | |
"cell_type": "code", | |
"input": "AT = re.compile(r'.*\\b[Aa]t\\b')\n\nfor rel in nltk.sem.extract_rels('PERSON','LOCATION', tree, pattern = AT):\n marries.append(nltk.sem.relextract.show_raw_rtuple(rel))", | |
"outputs": [], | |
"language": "python", | |
"trusted": true, | |
"collapsed": false | |
}, | |
{ | |
"metadata": {}, | |
"cell_type": "code", | |
"input": "#Putting it all together:", | |
"outputs": [], | |
"language": "python", | |
"trusted": true, | |
"collapsed": false | |
}, | |
{ | |
"metadata": {}, | |
"cell_type": "code", | |
"input": "#Set variable to collect info\nmarries = []\n\nfor doc in march1984_tagged:\n #Parse every document \n tree = chunker_rules(doc)\n #Relationship Extractors - #1\n MARRIES = re.compile(r'.*\\b[Mm]arrie[sd]\\b')\n for rel in nltk.sem.extract_rels('PERSON','PERSON', tree, pattern = MARRIES):\n marries.append(nltk.sem.relextract.show_raw_rtuple(rel))\n #Relationship Extractors - #2\n MARRIESv2 = re.compile(r'.*\\b[Mm]arrie[sd]\\b')\n for rel in nltk.sem.extract_rels('PERSON','ORGANIZATION', tree, pattern = MARRIESv2):\n marries.append(nltk.sem.relextract.show_raw_rtuple(rel))\n #Relationship Extractors - #3\n WED = re.compile(r'.*\\b[Ww]eds?\\b')\n for rel in nltk.sem.extract_rels('PERSON','PERSON', tree, pattern = WED):\n marries.append(nltk.sem.relextract.show_raw_rtuple(rel))\n #Relationship Extractors - #4\n ENGAGEMENT = re.compile(r'.*\\b([Ee]ngaged|[Ee]ngagement|[Ee]gagment\\b)')\n for rel in nltk.sem.extract_rels('PERSON','PERSON', tree, pattern = ENGAGEMENT):\n marries.append(nltk.sem.relextract.show_raw_rtuple(rel))\n ", | |
"outputs": [], | |
"language": "python", | |
"trusted": true, | |
"collapsed": false | |
}, | |
{ | |
"metadata": {}, | |
"cell_type": "code", | |
"input": "print len(marries)\n\nfor i in marries:\n print i + '\\n'", | |
"outputs": [], | |
"language": "python", | |
"trusted": true, | |
"collapsed": false | |
}, | |
{ | |
"metadata": {}, | |
"cell_type": "code", | |
"input": "#Set variable to collect info\nmarriage_location = []\n\nfor doc in march1984_tagged:\n #Parse every document \n tree = chunker_rules(doc)\n #Relationship Extractors - #1\n AT = re.compile(r'.*\\b[Aa]t\\b')\n for rel in nltk.sem.extract_rels('PERSON','ORGANIZATION', tree, pattern = AT):\n marriage_location.append(nltk.sem.relextract.show_raw_rtuple(rel))\n for rel in nltk.sem.extract_rels('LOCATION','ORGANIZATION', tree, pattern = AT):\n marriage_location.append(nltk.sem.relextract.show_raw_rtuple(rel))\n for rel in nltk.sem.extract_rels('PERSON','LOCATION', tree, pattern = AT):\n marriage_location.append(nltk.sem.relextract.show_raw_rtuple(rel))\n for rel in nltk.sem.extract_rels('LOCATION','PERSON', tree, pattern = AT):\n marriage_location.append(nltk.sem.relextract.show_raw_rtuple(rel))", | |
"outputs": [], | |
"language": "python", | |
"trusted": true, | |
"collapsed": false | |
}, | |
{ | |
"metadata": {}, | |
"cell_type": "code", | |
"input": "for i in marriage_location:\n print i + '\\n'", | |
"outputs": [], | |
"language": "python", | |
"trusted": true, | |
"collapsed": false | |
}, | |
{ | |
"metadata": {}, | |
"cell_type": "code", | |
"input": "marriage_location_dict = {'marriage_loc':[]}\nloc = []\nother = []\nfor i in marriage_location:\n if 'performed' in i or 'arrie' in i or 'Weds' in i or 'officiate' in i or 'Temple' in i or 'Church' in i :\n# if 'Church' in i or 'Temple' in i :\n loc.append(i)\n marriage_location_dict['marriage_loc'].append(i)\n else:\n other.append(i)", | |
"outputs": [], | |
"language": "python", | |
"trusted": true, | |
"collapsed": false | |
}, | |
{ | |
"metadata": {}, | |
"cell_type": "code", | |
"input": "print len(marries)\nprint len (marriage_location)\nprint len(loc)\nprint len(other)", | |
"outputs": [], | |
"language": "python", | |
"trusted": true, | |
"collapsed": false | |
}, | |
{ | |
"metadata": {}, | |
"cell_type": "code", | |
"input": "for i in loc:\n print i + '\\n'", | |
"outputs": [], | |
"language": "python", | |
"trusted": true, | |
"collapsed": false | |
}, | |
{ | |
"metadata": {}, | |
"cell_type": "code", | |
"input": "len(marriage_location_dict['marriage_loc'])", | |
"outputs": [], | |
"language": "python", | |
"trusted": true, | |
"collapsed": false | |
}, | |
{ | |
"metadata": {}, | |
"cell_type": "code", | |
"input": "print marriage_location_dict", | |
"outputs": [], | |
"language": "python", | |
"trusted": true, | |
"collapsed": false | |
}, | |
{ | |
"metadata": {}, | |
"cell_type": "code", | |
"input": "", | |
"outputs": [], | |
"language": "python", | |
"trusted": true, | |
"collapsed": false | |
} | |
], | |
"metadata": {} | |
} | |
], | |
"metadata": { | |
"name": "", | |
"signature": "sha256:3ecf8aab785c892aae24f34a1ca226d4b63a481d39098d3c4a3c45581cdca0f3", | |
"gist_id": "f18f7431fc48d542efe7" | |
}, | |
"nbformat": 3 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment