Created
December 19, 2014 00:53
-
-
Save fayeip/3440ce9537e568d8f743 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"worksheets": [ | |
{ | |
"cells": [ | |
{ | |
"metadata": {}, | |
"cell_type": "code", | |
"input": "import nltk\nfrom nltk.corpus import PlaintextCorpusReader\nimport re\nfrom itertools import chain\nfrom nltk import tokenize\nfrom nltk.corpus import stopwords\nimport nltk.data\nimport json\nimport pdb\nfrom collections import defaultdict", | |
"prompt_number": 1, | |
"outputs": [], | |
"language": "python", | |
"trusted": true, | |
"collapsed": false | |
}, | |
{ | |
"metadata": {}, | |
"cell_type": "code", | |
"input": "#Importing corpus\ncorpus_root = 'data'\nwordlists = PlaintextCorpusReader(corpus_root, '.*\\\\.txt')\nsent_detector = nltk.data.load('tokenizers/punkt/english.pickle')", | |
"prompt_number": 2, | |
"outputs": [], | |
"language": "python", | |
"trusted": true, | |
"collapsed": false | |
}, | |
{ | |
"metadata": {}, | |
"cell_type": "heading", | |
"source": "Part 1: Preprocessing and cleaning data", | |
"level": 3 | |
}, | |
{ | |
"metadata": {}, | |
"cell_type": "code", | |
"input": "#Create date regex parameters \ndate_pattern = '((J(anuary|u(ne|ly))|February|Ma(rch|y)|A(pril|ugust)|(((Sept|Nov|Dec)em)|Octo)ber).*([0-9]))'\nmp = '(J(anuary|u(ne|ly))|February|Ma(rch|y)|A(pril|ugust)|(((Sept|Nov|Dec)em)|Octo)ber)'\nyp = '[0-9]{4}'", | |
"prompt_number": 3, | |
"outputs": [], | |
"language": "python", | |
"trusted": true, | |
"collapsed": false | |
}, | |
{ | |
"metadata": {}, | |
"cell_type": "code", | |
"input": "# Testing the patterns \ntest = \"This is the month of November 9, 2014\"\ndate = re.search(date_pattern,test)\nm = re.search(mp,date.group(0))\nmonth = m.group(0)\ny = re.search(yp,date.group(0))\nyear = y.group(0)", | |
"prompt_number": 4, | |
"outputs": [], | |
"language": "python", | |
"trusted": true, | |
"collapsed": false | |
}, | |
{ | |
"metadata": {}, | |
"cell_type": "code", | |
"input": "#Set up the dictionaries \ncorpus_dict = {}\n\n#Putting it all together\nfor fileid in wordlists.fileids():\n #Part 1: split of xx of DOCUMENTS \n doc_list = re.split('((?m)^\\\\s+[0-9]+\\\\s*of\\\\s*[0-9]+\\\\s+DOCUMENTS)', wordlists.raw(fileid))\n doc_list.pop(0) #got rid of garbage first empty line\n master_list = list() # put all documents by id, header, footer\n #print len(doc_list) # keep for testing -- how many documents within a single file \n \n #Part 2: split into id, head and footer and create a triple tuple \n for idx in range(0, len(doc_list), 2):\n # add a new tuple of id, header, footer\n # split condition in order of importance\n split_conds = ['words\\r\\n\\r\\n', 'Edition\\r\\n\\r\\n', 'Society Desk\\r\\n\\r\\n','Society Desk\\r\\n\\r\\n\\r\\n','DATELINE: Camden, Me.,\\r\\n\\r\\n\\r\\n']\n doc_split = []\n for cond in split_conds:\n doc_split = re.split(cond,doc_list[idx+1], 1)\n if len(doc_split) == 2:\n break\n #Part 2 contd: Error check to see if any of the splits didn't go through \n if len(doc_split) < 2:\n doc_parts = (doc_list[idx], doc_split)\n print \"too few traces\"\n pdb.set_trace()\n elif len(doc_split) > 2:\n print \"too many splits\"\n else:\n doc_parts = (doc_list[idx], doc_split[0], doc_split[1])\n# print doc_split[0]\n# print '<><><><><><><><><>'\n# print doc_split[1]\n# print \"****************************************\"\n master_list.append(doc_parts) #Create that tuple triple \n \n year_counter = []\n #Part 3: Read the header and extract date \n for doc in master_list:\n #Part 3 a: Header cleaning steps \n clean_header = re.sub(r\"\\b(The New York Times|(DATELINE:.*)|(BYLINE.*)|(.*Correction Appended.*)|(SECTION:.*)|(LENGTH:.*)|(LOAD-DATE:.*)|(http:.*)|(LANGUAGE:.*)|(GRAPHIC:.*)|(Copyright.*)|(Late Edition - Final.*))\\b\", \"\", doc[1])\n clean_header = clean_header.replace(\"\\r\",\"\").strip()\n clean_header = [x for x in clean_header.split('\\n') if any(x.isalnum() for x in x)]\n header_final = ' '.join(clean_header)\n\n #Part 3b: Extracting the date\n date = re.search(date_pattern,header_final)\n m = re.search(mp,date.group(0))\n month = m.group(0)\n y = re.search(yp,date.group(0))\n year = y.group(0)\n year_counter.append(year) \n\n if \"Events\" not in header_final:\n body = doc[2]\n clean_sent = re.sub(r\"\\b(The New York Times|(DATELINE:.*)|(SECTION:.*)|(LENGTH:.*)|(LOAD-DATE:.*)|(http:.*)|(URL:.*)|(LANGUAGE:.*)|(PUBLICATION.*)|(GRAPHIC:.*)|(Copyright.*))\\b\", \"\", body)\n body = re.sub('\\r\\n(?!\\r\\n)', ' ',clean_sent)\n\n #Part 4 adding to the dictionary\n corpus_dict.setdefault(year,{}).setdefault(month, []).append((doc[0],header_final,body)) \n \n#Part 5: Write to a JSON file \nwith open('data/dict2014.json', 'wb') as fp:\n json.dump(corpus_dict, fp)\n ", | |
"prompt_number": 6, | |
"outputs": [], | |
"language": "python", | |
"trusted": true, | |
"collapsed": false | |
}, | |
{ | |
"metadata": {}, | |
"cell_type": "heading", | |
"source": "Part 2: NER Tagging and Chunking", | |
"level": 3 | |
}, | |
{ | |
"metadata": {}, | |
"cell_type": "code", | |
"input": "#Download Stanford NER taggers\nfrom nltk.tag.stanford import POSTagger\nfrom nltk.tag.stanford import NERTagger\npost = POSTagger('lib/stanford-postagger-2014-10-26/models/english-bidirectional-distsim.tagger',\n 'lib/stanford-postagger-2014-10-26/stanford-postagger.jar', 'utf-8')\n\nnert = NERTagger('lib/stanford-ner-2014-10-26/classifiers/english.all.3class.distsim.crf.ser.gz',\n 'lib/stanford-ner-2014-10-26/stanford-ner.jar', 'utf-8')", | |
"prompt_number": 303, | |
"outputs": [], | |
"language": "python", | |
"trusted": false, | |
"collapsed": false | |
}, | |
{ | |
"metadata": {}, | |
"cell_type": "code", | |
"input": "#Load the entity tagged file as a tuple of tuples \nfrom ast import literal_eval\n\ntagged_1984 = []\n\nwith open('data_tagged/1984_tagged.txt', 'r') as f:\n for line in f:\n line.split(',')\n tagged_1984.append(literal_eval(line.strip()))\n", | |
"prompt_number": 9, | |
"outputs": [], | |
"language": "python", | |
"trusted": true, | |
"collapsed": false | |
}, | |
{ | |
"metadata": {}, | |
"cell_type": "code", | |
"input": "# Import the RegexpParser\nfrom nltk.chunk import RegexpParser\n\n# Define custom tagged entities - group NE's together \n\ndef chunker_rules(values):\n # Define custom grammar (modified to be a valid regex).\n grammar = r'''\n PERSON:\n {<PERSON><O><PERSON>+}\n {<PERSON>+}\n ORGANIZATION: \n {<ORGANIZATION>+}\n LOCATION: \n {<LOCATION>+}\n WIDOW:\n {<W>}\n\n DIVORCED:\n {<D>} \n GROOM:\n {<G>}\n BRIDE:\n {<B>}\n RELIGIOUS:\n {<R><PERSON>+<O>+<LOCATION>}\n\n '''\n cp = nltk.RegexpParser(grammar) # Create an instance of your custom parser.\n return cp.parse(values) # Parse!\n\ndef entity_chunker(tagged_docs):\n chunks = []\n for doc in tagged_docs:\n tree = chunker_rules(doc)\n for subtree in tree.subtrees():\n if (subtree.node == 'WIDOW'):\n leaflist = [leaf[0] for leaf in subtree.leaves()]\n chunks.append(' '.join(leaflist))\n return chunks\n ", | |
"prompt_number": 7, | |
"outputs": [], | |
"language": "python", | |
"trusted": true, | |
"collapsed": false | |
}, | |
{ | |
"metadata": {}, | |
"cell_type": "code", | |
"input": "# print len(entity_chunker(tagged_1984))", | |
"prompt_number": 11, | |
"outputs": [], | |
"language": "python", | |
"trusted": true, | |
"collapsed": false | |
}, | |
{ | |
"metadata": {}, | |
"cell_type": "code", | |
"input": "#Customizing the tagger\n#assigning a custom tag in the word,tag \n\nfrom ast import literal_eval\n\ndef alter_source (sourcefile):\n f_before = open(sourcefile,'r')\n f_before_str = f_before.read()\n f_before.close()\n\n \"\"\"Customizing the tags labeled 'O'\n 1) Widow , Widower, Widowed >> label: W for widow\n 2) Mr., Mrs., Adm., Sgt., Dr. >> label: PERSON\n 3) Rev., Rabbi, priest >> label: R for religious \n 4) bride >> label: B for bride \n 5) bridegroom, groom >> label: G for groom\n\n \"\"\"\n\n f_after_str = ''\n # Adding the custom tag set 1 - widow\n f_after_str_1 = re.sub(r\"\\(\\'widow\\', \\'O\\'\\)\", \"('widow', 'W')\",f_before_str)\n f_after_str_2 = re.sub(r\"\\(\\'widower\\', \\'O\\'\\)\", \"('widower', 'W')\",f_after_str_1)\n f_after_str_3 = re.sub(r\"\\(\\'widowed\\', \\'O\\'\\)\", \"('widowed', 'W')\",f_after_str_2)\n\n #Adding the custom tag set 2 - person \n f_after_str_4 = re.sub(r\"\\(\\'Mr.\\', \\'O\\'\\)\", \"('Mr.', 'PERSON')\",f_after_str_3)\n f_after_str_5 = re.sub(r\"\\(\\'Mrs.\\', \\'O\\'\\)\", \"('Mrs.', 'PERSON')\",f_after_str_4)\n f_after_str_6 = re.sub(r\"\\(\\'Adm.\\', \\'O\\'\\)\", \"('Adm.', 'PERSON')\",f_after_str_5)\n f_after_str_7 = re.sub(r\"\\(\\'Sgt.\\', \\'O\\'\\)\", \"('Sgt.', 'PERSON')\",f_after_str_6)\n f_after_str_8 = re.sub(r\"\\(\\'Dr.\\', \\'O\\'\\)\", \"('Dr.', 'PERSON')\",f_after_str_7)\n\n\n #Adding the custom tag set 3 - religious head \n f_after_str_9 = re.sub(r\"\\(\\'Rev.\\', \\'O\\'\\)\", \"('Rev.', 'R')\",f_after_str_8)\n f_after_str_10 = re.sub(r\"\\(\\'\\bRabbi\\b\\', \\'O\\'\\)\", \"('Rabbi', 'R')\",f_after_str_9)\n f_after_str_11 = re.sub(r\"\\(\\'\\bpriest\\b\\', \\'O\\'\\)\", \"('priest','R')\",f_after_str_10)\n\n # Adding the custom tag set 4 - divorced\n f_after_str_12 = re.sub(r\"\\(\\'\\bdivorce\\b\\', \\'O\\'\\)\", \"('divorce', 'D')\",f_after_str_11)\n f_after_str_13 = re.sub(r\"\\(\\'\\bdivorced\\b\\', \\'O\\'\\)\", \"('divorced', 'D')\",f_after_str_12)\n\n # Adding the custom tag set 4 - divorced\n f_after_str_14 = re.sub(r\"\\(\\'divorce\\', \\'O\\'\\)\", \"sufia\", f_after_str_13)\n f_after_str_15 = re.sub(r\"\\(\\'divorced\\', \\'O\\'\\)\", \"('divorced', 'D')\",f_after_str_14)\n\n # Adding the custom tag set 5 - bride\n f_after_str_16 = re.sub(r\"\\(\\'\\bbride\\b\\', \\'O\\'\\)\", \"('bride', 'B')\",f_after_str_15)\n\n # Adding the custom tag set 6 - bridegroom\n f_after_str_17 = re.sub(r\"\\(\\'\\bbridegroom\\b\\', \\'O\\'\\)\", \"('bridegroom', 'G')\",f_after_str_16)\n f_after_str_final = re.sub(r\"\\(\\'\\bgroom\\b\\', \\'O\\'\\)\", \"('groom', 'G')\",f_after_str_17)\n \n return f_after_str_final\n\n\ndef apply_custom_tags (targetfile, custom_tags):\n f = open(targetfile,'w')\n f.write(custom_tags)\n f.close()\n\n custom_tag_list = []\n\n with open(targetfile, 'r') as g:\n for line in g:\n line.split('\\n')\n custom_tag_list.append(literal_eval(line.strip()))\n return custom_tag_list", | |
"prompt_number": 32, | |
"outputs": [], | |
"language": "python", | |
"trusted": false, | |
"collapsed": false | |
}, | |
{ | |
"metadata": {}, | |
"cell_type": "code", | |
"input": "#Run custom tagger for all 5 years\n\ncustom_tagged_1984 = apply_custom_tags('data_tagged/1984_tagged_custom.txt',alter_source('data_tagged/1984_tagged.txt'))\ncustom_tagged_1990 = apply_custom_tags('data_tagged/1990_tagged_custom.txt',alter_source('data_tagged/1990_tagged.txt'))\ncustom_tagged_2000 = apply_custom_tags('data_tagged/2000_tagged_custom.txt',alter_source('data_tagged/2000_tagged.txt'))\ncustom_tagged_2010 = apply_custom_tags('data_tagged/2010_tagged_custom.txt',alter_source('data_tagged/2010_tagged.txt'))\ncustom_tagged_2014 = apply_custom_tags('data_tagged/2014_tagged_custom.txt',alter_source('data_tagged/2014_tagged.txt'))\n\n", | |
"prompt_number": 186, | |
"outputs": [], | |
"language": "python", | |
"trusted": false, | |
"collapsed": false | |
}, | |
{ | |
"metadata": {}, | |
"cell_type": "heading", | |
"source": "Part 3: Creating dictionaries to store extracted data", | |
"level": 3 | |
}, | |
{ | |
"metadata": {}, | |
"cell_type": "code", | |
"input": "#Staging the dictionaries\nrel_dict_1984 = defaultdict(dict)\nrel_dict_1990 = defaultdict(dict)\nrel_dict_2000 = defaultdict(dict)\nrel_dict_2010 = defaultdict(dict)\nrel_dict_2014 = defaultdict(dict)", | |
"prompt_number": 421, | |
"outputs": [], | |
"language": "python", | |
"trusted": false, | |
"collapsed": false | |
}, | |
{ | |
"metadata": {}, | |
"cell_type": "code", | |
"input": "def lift_names_remove_tags(relation, extracted_relation_str):\n \n if relation == 'DAUGHTER' or relation == 'SON':\n rels_pattern = r\"\\[PER: (.*)\\] (.*) \\[PER: (.*)\\]\"\n name_pattern = re.compile(r\"(\\w*\\.?)\\/PERSON\")\n names_search = re.search(rels_pattern, extracted_relation_str)\n bride_or_groom_name = name_pattern.findall(names_search.group(1))\n parents_name = name_pattern.findall(names_search.group(3))\n bride_or_groom_name_str = ''\n parents_name_str = ''\n for bg in bride_or_groom_name:\n bride_or_groom_name_str += bg + \" \"\n for p in parents_name:\n parents_name_str += p + \" \"\n\n return bride_or_groom_name_str.strip(), parents_name_str.strip()", | |
"prompt_number": 173, | |
"outputs": [], | |
"language": "python", | |
"trusted": false, | |
"collapsed": false | |
}, | |
{ | |
"metadata": {}, | |
"cell_type": "code", | |
"input": "def extract_rels(rel_dict, dict_key, relKey, rel1, rel2, tree,regex): \n \"\"\"This function extracts the relationships\n Function Outputs:\n final output = {dict_key: [{relKey:[relationship extracted]}, {relKey:[relationship extracted]}]}\n example = {1:[{bride:['Mary Flyn marries John Mayer],{groom: ['John is a son of Mr and Mrs Mayer]}}]}\n \n Function Inputs:\n 1) rel_dict = This is the default dict that will contain all the patterns in a dictionary per wedding announcement\n 2) dict_key = This is basically a counter per wedding announcment \n 3) relKey = This is the second key i.e. the relationship type you want to get values for \n 4) rel1 , rel2, regex = 'PERSON' [the word \"marries\"] 'PERSON'\n 5) tree = the parsed tree\n \"\"\" \n for rel in nltk.sem.extract_rels(rel1, rel2, tree, pattern = regex):\n dict_values = []\n dict_values.append(nltk.sem.relextract.show_raw_rtuple(rel))\n rel_dict[str(dict_key)][relKey] = dict_values", | |
"prompt_number": 174, | |
"outputs": [], | |
"language": "python", | |
"trusted": false, | |
"collapsed": false | |
}, | |
{ | |
"metadata": {}, | |
"cell_type": "code", | |
"input": "def make_rels_dict(tagged_data, rel_dict, relKey, rel1,rel2,regex):\n \"\"\"This function makes the dictionary for the relationships you want to extract -- \n read comments in function \"extract_rels\" for more context\"\"\"\n dict_key = 1\n \n for doc in tagged_data:\n tree = chunker_rules(doc)\n extract_rels (rel_dict, dict_key, relKey, rel1, rel2, tree,regex)\n dict_key +=1\n return rel_dict", | |
"prompt_number": 175, | |
"outputs": [], | |
"language": "python", | |
"trusted": false, | |
"collapsed": false | |
}, | |
{ | |
"metadata": {}, | |
"cell_type": "code", | |
"input": "DAUGHTER = re.compile(r'.*\\bdaughter\\b')\n\"\"\"Create an entry in a dictionary for the bride based on the pattern called DAUGHTER\"\"\"\n\nmake_rels_dict(custom_tagged_1984,rel_dict_1984,'bride','PERSON','PERSON',DAUGHTER)\nmake_rels_dict(custom_tagged_1990,rel_dict_1990,'bride','PERSON','PERSON',DAUGHTER)\nmake_rels_dict(custom_tagged_2000,rel_dict_2000,'bride','PERSON','PERSON',DAUGHTER)\nmake_rels_dict(custom_tagged_2010,rel_dict_2010,'bride','PERSON','PERSON',DAUGHTER)\nmake_rels_dict(custom_tagged_2014,rel_dict_2014,'bride','PERSON','PERSON',DAUGHTER)\nprint \"Bride key has been added to master dict\"", | |
"prompt_number": 422, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": "Bride key has been added to master dict\n" | |
} | |
], | |
"language": "python", | |
"trusted": false, | |
"collapsed": false | |
}, | |
{ | |
"metadata": {}, | |
"cell_type": "code", | |
"input": "test = rel_dict['347']['bride']\nprint test", | |
"prompt_number": 128, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": "[\"[PER: 'Leicia/PERSON Sharon/PERSON Osborne/PERSON'] ',/O the/O daughter/O of/O' [PER: 'Mr./PERSON and/O Mrs./PERSON Philip/PERSON Barry/PERSON Osborne/PERSON']\"]\n" | |
} | |
], | |
"language": "python", | |
"trusted": false, | |
"collapsed": false | |
}, | |
{ | |
"metadata": {}, | |
"cell_type": "code", | |
"input": "lift_names_remove_tags('DAUGHTER', test[0])", | |
"prompt_number": 129, | |
"outputs": [ | |
{ | |
"output_type": "pyout", | |
"prompt_number": 129, | |
"metadata": {}, | |
"text": "('Leicia Sharon Osborne', 'Mr. Mrs. Philip Barry Osborne')" | |
} | |
], | |
"language": "python", | |
"trusted": false, | |
"collapsed": false | |
}, | |
{ | |
"metadata": {}, | |
"cell_type": "code", | |
"input": "SON = re.compile(r'.*\\bson\\b')\n\"\"\"Create an entry in a dictionary for the groom based on the pattern called SON\"\"\"\n\nmake_rels_dict(custom_tagged_1984,rel_dict_1984,'groom','PERSON','PERSON',SON) \nmake_rels_dict(custom_tagged_1990,rel_dict_1990,'groom','PERSON','PERSON',SON) \nmake_rels_dict(custom_tagged_2000,rel_dict_2000,'groom','PERSON','PERSON',SON) \nmake_rels_dict(custom_tagged_2010,rel_dict_2010,'groom','PERSON','PERSON',SON) \nmake_rels_dict(custom_tagged_2014,rel_dict_2014,'groom','PERSON','PERSON',SON) \nprint \"Groom key has been added to master dict\"", | |
"prompt_number": 423, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": "Groom key has been added to master dict\n" | |
} | |
], | |
"language": "python", | |
"trusted": false, | |
"collapsed": false | |
}, | |
{ | |
"metadata": {}, | |
"cell_type": "code", | |
"input": "groom_test = rel_dict['347']['groom']\nprint groom_test", | |
"prompt_number": 132, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": "[\"[PER: 'Michael/PERSON Anthony/PERSON Milano/PERSON'] ',/O a/O son/O of/O' [PER: 'Mr./PERSON and/O Mrs./PERSON John/PERSON A./PERSON Milano/PERSON']\"]\n" | |
} | |
], | |
"language": "python", | |
"trusted": false, | |
"collapsed": false | |
}, | |
{ | |
"metadata": {}, | |
"cell_type": "code", | |
"input": "lift_names_remove_tags('SON', groom_test[0])", | |
"prompt_number": 133, | |
"outputs": [ | |
{ | |
"output_type": "pyout", | |
"prompt_number": 133, | |
"metadata": {}, | |
"text": "('Michael Anthony Milano', 'Mr. Mrs. John A. Milano')" | |
} | |
], | |
"language": "python", | |
"trusted": false, | |
"collapsed": false | |
}, | |
{ | |
"metadata": {}, | |
"cell_type": "code", | |
"input": "#Task 2 - Hometowns of whom is being married \nOF = re.compile(r'.*\\bof\\b')\n\"\"\"Create an entry in a dictionary for the marriage location based on the pattern called DAUGHTER\"\"\"\n\nmake_rels_dict(custom_tagged_1984,rel_dict_1984,'hometowns','PERSON','LOCATION',OF) \nmake_rels_dict(custom_tagged_1990,rel_dict_1990,'hometowns','PERSON','LOCATION',OF) \nmake_rels_dict(custom_tagged_2000,rel_dict_2000,'hometowns','PERSON','LOCATION',OF) \nmake_rels_dict(custom_tagged_2010,rel_dict_2010,'hometowns','PERSON','LOCATION',OF) \nmake_rels_dict(custom_tagged_2014,rel_dict_2014,'hometowns','PERSON','LOCATION',OF) \nprint \"Hometowns key has been added to master dict\"", | |
"prompt_number": 424, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": "Hometowns key has been added to master dict\n" | |
} | |
], | |
"language": "python", | |
"trusted": false, | |
"collapsed": false | |
}, | |
{ | |
"metadata": {}, | |
"cell_type": "code", | |
"input": "IS = re.compile(r'.*\\bis\\b')\n\"\"\"Create an entry in a dictionary for profession_v2 based on the pattern called IS\"\"\"\n\nmake_rels_dict(custom_tagged_1984,rel_dict_1984,'profession_v2','PERSON','ORGANIZATION',IS)\nmake_rels_dict(custom_tagged_1990,rel_dict_1990,'profession_v2','PERSON','ORGANIZATION',IS)\nmake_rels_dict(custom_tagged_2000,rel_dict_2000,'profession_v2','PERSON','ORGANIZATION',IS)\nmake_rels_dict(custom_tagged_2010,rel_dict_2010,'profession_v2','PERSON','ORGANIZATION',IS)\nmake_rels_dict(custom_tagged_2014,rel_dict_2014,'profession_v2','PERSON','ORGANIZATION',IS)\nprint \"Professions version 2 (example: Mary May is a teacher at Riverdale School) key has been added to master dict\"", | |
"prompt_number": 425, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": "Professions version 2 (example: Mary May is a teacher at Riverdale School) key has been added to master dict\n" | |
} | |
], | |
"language": "python", | |
"trusted": false, | |
"collapsed": false | |
}, | |
{ | |
"metadata": {}, | |
"cell_type": "code", | |
"input": "EDU = re.compile(r'.*\\b([Dd]egree|[M.B.A.]|[M.S.]|[M.D.]|[Dd]esigner|[Mm]aster\\'s|[Gg]raduated?|[Ee]nrolled?|complet[ing|ed|e])\\b')\n\"\"\"Create an entry in a dictionary for education based on the pattern called EDU\"\"\"\n\nmake_rels_dict(custom_tagged_1984,rel_dict_1984,'education','PERSON','ORGANIZATION',EDU)\nmake_rels_dict(custom_tagged_1990,rel_dict_1990,'education','PERSON','ORGANIZATION',EDU)\nmake_rels_dict(custom_tagged_2000,rel_dict_2000,'education','PERSON','ORGANIZATION',EDU)\nmake_rels_dict(custom_tagged_2010,rel_dict_2010,'education','PERSON','ORGANIZATION',EDU)\nmake_rels_dict(custom_tagged_2014,rel_dict_2014,'education','PERSON','ORGANIZATION',EDU)\nprint \"Education key has been added to master dict\"", | |
"prompt_number": 426, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": "Education key has been added to master dict\n" | |
} | |
], | |
"language": "python", | |
"trusted": false, | |
"collapsed": false | |
}, | |
{ | |
"metadata": {}, | |
"cell_type": "code", | |
"input": "\"\"\"Extract values from the education and profession keys from the reldicts for analysis\"\"\"\n\ndef extract_relsdict_values (rel_dict):\n range_reldict = len(rel_dict.keys())\n edu_values = []\n prof_values = []\n hometown_values = []\n \n for idx in range (0,range_reldict):\n if rel_dict[str(idx)].has_key('education'):\n edu_values.append(rel_dict[str(idx)]['education'])\n elif rel_dict[str(idx)].has_key('profession_v2'):\n prof_values.append(rel_dict[str(idx)]['profession_v2'])\n\n \n edu_values_flat = flatten(edu_values)\n prof_values_flat = flatten(prof_values)\n return edu_values_flat, prof_values_flat", | |
"prompt_number": 453, | |
"outputs": [], | |
"language": "python", | |
"trusted": false, | |
"collapsed": false | |
}, | |
{ | |
"metadata": {}, | |
"cell_type": "code", | |
"input": "\"\"\"Extract and format values from the education key from the reldicts for analysis\"\"\"\n\ndef extract_edu (alist):\n final = []\n for i in alist:\n rels_pattern = r\"\\[PER: (.*)\\] (.*) \\[ORG: (.*)\\]\"\n org_pattern = re.compile(r\"(\\w*\\.?)\\/ORGANIZATION\")\n rels_search = re.search(rels_pattern, i)\n if rels_search != None:\n org_name = org_pattern.findall(rels_search.group(3))\n else:\n org_name = \"\"\n \n final.append(\" \".join(org_name))\n return final", | |
"prompt_number": 510, | |
"outputs": [], | |
"language": "python", | |
"trusted": false, | |
"collapsed": false | |
}, | |
{ | |
"metadata": {}, | |
"cell_type": "code", | |
"input": "\"\"\"Extract values from the education and profession keys from the reldicts for analysis\"\"\"\n\nedu_values1984, prof_values1984 = extract_relsdict_values(rel_dict_1984)\nedu_values1990, prof_values1990 = extract_relsdict_values(rel_dict_1990)\nedu_values2000, prof_values2000 = extract_relsdict_values(rel_dict_2000)\nedu_values2010, prof_values2010 = extract_relsdict_values(rel_dict_2010)\nedu_values2014, prof_values2014 = extract_relsdict_values(rel_dict_2014)", | |
"prompt_number": 511, | |
"outputs": [], | |
"language": "python", | |
"trusted": false, | |
"collapsed": false | |
}, | |
{ | |
"metadata": {}, | |
"cell_type": "code", | |
"input": "\"\"\"Run fd on the education key from the reldicts for analysis\"\"\"\n\nedu1984_fd = nltk.FreqDist(extract_edu(edu_values1984))\nedu1990_fd = nltk.FreqDist(extract_edu(edu_values1990))\nedu2000_fd = nltk.FreqDist(extract_edu(edu_values2000))\nedu2010_fd = nltk.FreqDist(extract_edu(edu_values2010))\nedu2014_fd = nltk.FreqDist(extract_edu(edu_values2014))\n\ncsv_str = \"\"\nfor school,count in edu1984_fd.items()[:20]:\n csv_str += \"1984,\"+str(school)+\",\"+str(count)+\"\\n\"\nfor school,count in edu1990_fd.items()[:20]:\n csv_str += \"1990,\"+str(school)+\",\"+str(count)+\"\\n\"\nfor school,count in edu2000_fd.items()[:20]:\n csv_str += \"2000,\"+str(school)+\",\"+str(count)+\"\\n\"\nfor school,count in edu2010_fd.items()[:20]:\n csv_str += \"2010,\"+str(school)+\",\"+str(count)+\"\\n\"\nfor school,count in edu2014_fd.items()[:20]:\n csv_str += \"2014,\"+str(school)+\",\"+str(count)+\"\\n\"\n\ncsv_file = open('csv_file_schools.csv','w')\ncsv_file.write(csv_str)\ncsv_file.close()", | |
"prompt_number": 519, | |
"outputs": [], | |
"language": "python", | |
"trusted": false, | |
"collapsed": false | |
}, | |
{ | |
"metadata": {}, | |
"cell_type": "code", | |
"input": "\"\"\"Extract and format values from the profession key from the reldicts for analysis\"\"\"\n\ndef extract_prof (alist):\n final = []\n for i in alist:\n rels_pattern = r\"(?:\\[PER: (.*)\\]) (.*) (?:\\[ORG: (.*)\\])\"\n org_pattern = re.compile(r'.*\\b([Pp]rofessor|[Bb]anker|[Pp]rogrammer|[Aa]nalyst|[Aa]ssociate|[Hh]ead[master?]|[Cc]onsultant|[Cc]hairman|[Dd]octorate|[Aa]ccountant|[Ff]reelance|[Pp]artner|[Mm]anager|[Tt]eacher|[Ll]awyer|[Pp]resident|[Dd]ean|[Ee]ngineer|[Aa]ssistant|[Dd]irector|[Ee]xecutive)\\b')\n\n rels_search = re.search(rels_pattern, i)\n if rels_search != None:\n org_name = org_pattern.findall(rels_search.group(0))\n else:\n org_name = \"\"\n \n final.append(\" \".join(org_name))\n #print len(final)\n return final", | |
"prompt_number": 496, | |
"outputs": [], | |
"language": "python", | |
"trusted": false, | |
"collapsed": false | |
}, | |
{ | |
"metadata": {}, | |
"cell_type": "code", | |
"input": "\"\"\"Run fd on the profession key from the reldicts for analysis\"\"\"\n\nprof1984_fd = nltk.FreqDist(extract_edu(prof_values1984))\nprof1990_fd = nltk.FreqDist(extract_edu(prof_values1990))\nprof2000_fd = nltk.FreqDist(extract_edu(prof_values2000))\nprof2010_fd = nltk.FreqDist(extract_edu(prof_values2010))\nprof2014_fd = nltk.FreqDist(extract_edu(prof_values2014))\n\n\nprint \"1984 - profession fd\"\nprint \"=\" * 100 , '\\n'\nfor i in prof1984_fd.items()[1:21]:\n print i[0] ,i[1] , '\\n'\n\nprint \"1990 - profession fd\"\nprint \"=\" * 100 , '\\n'\nfor i in prof1990_fd.items()[1:21]:\n print i[0] ,'\\n'\n\nprint \"2000 - profession fd\"\nprint \"=\" * 100 , '\\n'\nfor i in prof2000_fd.items()[1:21]:\n print i[0], '\\n'\n\nprint \"2010 - profession fd\"\nprint \"=\" * 100 , '\\n'\nfor i in prof2010_fd.items()[1:21]:\n print i[0] , '\\n'\n\nprint \"2014 - profession fd\"\nprint \"=\" * 100 , '\\n'\nfor i in prof2014_fd.items()[1:21]:\n print i[0] , '\\n'", | |
"prompt_number": 509, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": "535\n525" | |
}, | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": "\n361" | |
}, | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": "\n106" | |
}, | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": "\n236" | |
}, | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": "\n1984 - profession fd\n==================================================================================================== \n\npresident 101 \n\nmanager 43 \n\ndirector 42 \n\nexecutive 25 \n\nassistant 18 \n\nteacher 16 \n\nassociate 15 \n\nconsultant 14 \n\npartner 14 \n\nanalyst 12 \n\nchairman 11 \n\nprofessor 11 \n\nlawyer 5 \n\naccountant 4 \n\nDean 3 \n\nbanker 3 \n\nengineer 3 \n\ndean 2 \n\ndoctorate 1 \n\n1990 - profession fd\n==================================================================================================== \n\npresident \n\ndirector \n\nassociate \n\nmanager \n\nteacher \n\nassistant \n\nexecutive \n\npartner \n\nanalyst \n\nprofessor \n\nconsultant \n\nengineer \n\nchairman \n\ndean \n\nlawyer \n\nExecutive \n\naccountant \n\nProfessor \n\nfreelance \n\nheads \n\n2000 - profession fd\n==================================================================================================== \n\ndirector \n\npresident \n\nmanager \n\nassociate \n\npartner \n\nexecutive \n\nprofessor \n\nanalyst \n\nconsultant \n\nDean \n\nassistant \n\nteacher \n\nbanker \n\nengineer \n\nchairman \n\nlawyer \n\ndean \n\n2010 - profession fd\n==================================================================================================== \n\npresident \n\nmanager \n\nassociate \n\ndirector \n\npartner \n\nanalyst \n\nprofessor \n\nteacher \n\naccountant \n\nconsultant \n\ndoctorate \n\nengineer \n\nexecutive \n\nlawyer \n\n2014 - profession fd\n==================================================================================================== \n\ndirector \n\nmanager \n\nassociate \n\nanalyst \n\npresident \n\nteacher \n\nexecutive \n\nassistant \n\npartner \n\nlawyer \n\nprofessor \n\nconsultant \n\nAssociate \n\nProfessor \n\nchairman \n\ndean \n\n" | |
} | |
], | |
"language": "python", | |
"trusted": false, | |
"collapsed": false | |
}, | |
{ | |
"metadata": {}, | |
"cell_type": "code", | |
"input": "print prof1984_fd", | |
"prompt_number": 498, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": "<FreqDist: '': 192, 'president': 101, 'manager': 43, 'director': 42, 'executive': 25, 'assistant': 18, 'teacher': 16, 'associate': 15, 'consultant': 14, 'partner': 14, ...>\n" | |
} | |
], | |
"language": "python", | |
"trusted": false, | |
"collapsed": false | |
}, | |
{ | |
"metadata": {}, | |
"cell_type": "code", | |
"input": "A = re.compile(r'.*\\ba\\b')\nmake_rels_dict(custom_tagged_1984,rel_dict,'widow','PERSON','W',A)\nprint \"widow key has been added to master dict\"", | |
"prompt_number": 184, | |
"outputs": [ | |
{ | |
"ename": "ValueError", | |
"evalue": "your value for the object type has not been recognized: W", | |
"traceback": [ | |
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m\n\u001b[0;31mValueError\u001b[0m Traceback (most recent call last)", | |
"\u001b[0;32m<ipython-input-184-2a43633947e9>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m()\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[0mA\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mre\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcompile\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34mr'.*\\ba\\b'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 2\u001b[0;31m \u001b[0mmake_rels_dict\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mcustom_tagged_1984\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0mrel_dict\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m'widow'\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m'PERSON'\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m'W'\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0mA\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 3\u001b[0m \u001b[0;32mprint\u001b[0m \u001b[0;34m\"widow key has been added to master dict\"\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", | |
"\u001b[0;32m<ipython-input-175-c8d1bf77bc45>\u001b[0m in \u001b[0;36mmake_rels_dict\u001b[0;34m(tagged_data, rel_dict, relKey, rel1, rel2, regex)\u001b[0m\n\u001b[1;32m 6\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mdoc\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mtagged_data\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 7\u001b[0m \u001b[0mtree\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mchunker_rules\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdoc\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 8\u001b[0;31m \u001b[0mextract_rels\u001b[0m \u001b[0;34m(\u001b[0m\u001b[0mrel_dict\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdict_key\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mrelKey\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mrel1\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mrel2\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtree\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0mregex\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 9\u001b[0m \u001b[0mdict_key\u001b[0m \u001b[0;34m+=\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 10\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mrel_dict\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", | |
"\u001b[0;32m<ipython-input-174-5bfe82b2bc8b>\u001b[0m in \u001b[0;36mextract_rels\u001b[0;34m(rel_dict, dict_key, relKey, rel1, rel2, tree, regex)\u001b[0m\n\u001b[1;32m 12\u001b[0m 5) tree = the parsed tree\n\u001b[1;32m 13\u001b[0m \"\"\" \n\u001b[0;32m---> 14\u001b[0;31m \u001b[0;32mfor\u001b[0m \u001b[0mrel\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mnltk\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msem\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mextract_rels\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mrel1\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mrel2\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtree\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mpattern\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mregex\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 15\u001b[0m \u001b[0mdict_values\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 16\u001b[0m \u001b[0mdict_values\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mappend\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mnltk\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msem\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mrelextract\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mshow_raw_rtuple\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mrel\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", | |
"\u001b[0;32m/Users/sufia/anaconda/lib/python2.7/site-packages/nltk/sem/relextract.pyc\u001b[0m in \u001b[0;36mextract_rels\u001b[0;34m(subjclass, objclass, doc, corpus, pattern, window)\u001b[0m\n\u001b[1;32m 209\u001b[0m \u001b[0msubjclass\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0m_expand\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0msubjclass\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 210\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 211\u001b[0;31m \u001b[0;32mraise\u001b[0m \u001b[0mValueError\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m\"your value for the subject type has not been recognized: %s\"\u001b[0m \u001b[0;34m%\u001b[0m \u001b[0msubjclass\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 212\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mobjclass\u001b[0m \u001b[0;32mand\u001b[0m \u001b[0mobjclass\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mNE_CLASSES\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mcorpus\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 213\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0m_expand\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mobjclass\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mNE_CLASSES\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mcorpus\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", | |
"\u001b[0;31mValueError\u001b[0m: your value for the object type has not been recognized: W" | |
], | |
"output_type": "pyerr" | |
} | |
], | |
"language": "python", | |
"trusted": false, | |
"collapsed": false | |
}, | |
{ | |
"metadata": {}, | |
"cell_type": "code", | |
"input": "\"\"\"Code to assess the marriage location in -- here I will try and the relevant results as a dictionary in itself\"\"\"\n\nAT = re.compile(r'.*\\b[Aa]t\\b')\n\ndef wedding_location_finder (tagged_data,regex, mprint=False):\n marriage_location = []\n\n for doc in tagged_data:\n #Parse every document \n tree = chunker_rules(doc)\n #Relationship Extractors - #1\n \n for rel in nltk.sem.extract_rels('PERSON','ORGANIZATION', tree, pattern = regex):\n marriage_location.append(nltk.sem.relextract.show_raw_rtuple(rel))\n for rel in nltk.sem.extract_rels('LOCATION','ORGANIZATION', tree, pattern = regex):\n marriage_location.append(nltk.sem.relextract.show_raw_rtuple(rel))\n for rel in nltk.sem.extract_rels('PERSON','LOCATION', tree, pattern = regex):\n marriage_location.append(nltk.sem.relextract.show_raw_rtuple(rel))\n for rel in nltk.sem.extract_rels('LOCATION','PERSON', tree, pattern = regex):\n marriage_location.append(nltk.sem.relextract.show_raw_rtuple(rel))\n\n # \"Creating a marriage location dictionary\"\n marriage_loc_dict = {'marriage_loc':[]}\n loc = []\n other = []\n\n # \"Running code to restrict the wide net to see where the wedding took place\"\n for i in marriage_location:\n if 'performed' in i or 'arrie' in i or 'Weds' in i or 'officiate' in i or 'Temple' in i or 'Church' in i or 'church' in i:\n loc.append(i)\n marriage_loc_dict['marriage_loc'].append(i)\n else:\n other.append(i)\n \n if mprint == True:\n print \"=\" * 125 \n print \"First pass - regex patterns AT: note - casts a wide net \" , len (marriage_location)\n print \"=\" * 125 , \"\\n\"\n for i in marriage_location[:5]:\n print i + '\\n' \n\n print \"=\" * 125 \n print \"Marriage location - the ones that make it in = \" , len(loc)\n print \"=\" * 125 , \"\\n\"\n for i in loc[:5]:\n print i , '\\n'\n\n print \"=\" * 125 \n print \"Marriage location - the ones that didn't make it in = \" , len(other) \n print \"=\" * 125 , \"\\n\"\n for i in other[:5]:\n print i , '\\n'\n\n return marriage_loc_dict.values()\n \n ", | |
"outputs": [], | |
"language": "python", | |
"trusted": false, | |
"collapsed": false | |
}, | |
{ | |
"metadata": {}, | |
"cell_type": "code", | |
"input": "wedding_loc_1984 = wedding_location_finder(custom_tagged_1984,AT, mprint=False)", | |
"prompt_number": 363, | |
"outputs": [], | |
"language": "python", | |
"trusted": false, | |
"collapsed": false | |
}, | |
{ | |
"metadata": {}, | |
"cell_type": "markdown", | |
"source": "**Sample output from wedding_loc_1984 run**\n\n**=============================================================================================================================\nFirst pass - regex patterns AT: note - casts a wide net 1805\n============================================================================================================================**\n\n[PER: 'George/PERSON Eckstein/PERSON'] 'performed/O the/O ceremony/O at/O the/O' [ORG: 'Long/ORGANIZATION Ridge/ORGANIZATION Congregational/ORGANIZATION Church/ORGANIZATION']\n\n[LOC: 'White/LOCATION Plains/LOCATION'] ',/O is/O studying/O for/O a/O M.B.A./O degree/O at/O' [ORG: 'Columbia/ORGANIZATION University/ORGANIZATION']\n\n[LOC: 'N.Y./LOCATION'] ',/O and/O an/O adjunct/O associate/O professor/O at/O the/O' [ORG: 'C.V./ORGANIZATION Starr/ORGANIZATION Center/ORGANIZATION for/ORGANIZATION Applied/ORGANIZATION Economics/ORGANIZATION']\n\n[LOC: 'Stony/LOCATION Brook/LOCATION'] './O Her/O father/O is/O head/O of/O operations/O at/O' [ORG: 'L./ORGANIZATION F./ORGANIZATION Rothschild/ORGANIZATION Unterberg/ORGANIZATION Towbin/ORGANIZATION']\n\n[PER: 'Mr./PERSON Fleming/PERSON'] 'are/O senior/O vice/O presidents/O at/O' [LOC: 'Moseley/LOCATION']\n\n**=============================================================================================================================\nMarriage location - the ones that make it in = 765\n============================================================================================================================**\n\n[PER: 'George/PERSON Eckstein/PERSON'] 'performed/O the/O ceremony/O at/O the/O' [ORG: 'Long/ORGANIZATION Ridge/ORGANIZATION Congregational/ORGANIZATION Church/ORGANIZATION'] \n\n[PER: 'Robert/PERSON Cowperthwaite/PERSON'] 'performed/O the/O Episcopal/O ceremony/O at/O' [ORG: \"St./ORGANIZATION Paul/ORGANIZATION 's/ORGANIZATION Chapel/ORGANIZATION of/ORGANIZATION Trinity/ORGANIZATION Church/ORGANIZATION\"] \n\n[PER: 'Thomas/PERSON D./PERSON Bowers/PERSON'] 'performed/O the/O ceremony/O at/O' [ORG: 'St./ORGANIZATION Bartholomew/ORGANIZATION'] \n\n[PER: 'Clinton/PERSON'] './O The/O nondenominational/O ceremony/O was/O performed/O at/O the/O' [ORG: 'Hamilton/ORGANIZATION College/ORGANIZATION Chapel/ORGANIZATION'] \n\n[PER: 'W./PERSON James/PERSON White/PERSON'] 'performed/O the/O ceremony/O at/O the/O' [ORG: 'United/ORGANIZATION Methodist/ORGANIZATION Church/ORGANIZATION'] \n\n**=============================================================================================================================\nMarriage location - the ones that didn't make it in = 1040\n============================================================================================================================**\n\n[LOC: 'White/LOCATION Plains/LOCATION'] ',/O is/O studying/O for/O a/O M.B.A./O degree/O at/O' [ORG: 'Columbia/ORGANIZATION University/ORGANIZATION'] \n\n[LOC: 'N.Y./LOCATION'] ',/O and/O an/O adjunct/O associate/O professor/O at/O the/O' [ORG: 'C.V./ORGANIZATION Starr/ORGANIZATION Center/ORGANIZATION for/ORGANIZATION Applied/ORGANIZATION Economics/ORGANIZATION'] \n\n[LOC: 'Stony/LOCATION Brook/LOCATION'] './O Her/O father/O is/O head/O of/O operations/O at/O' [ORG: 'L./ORGANIZATION F./ORGANIZATION Rothschild/ORGANIZATION Unterberg/ORGANIZATION Towbin/ORGANIZATION'] \n\n[PER: 'Mr./PERSON Fleming/PERSON'] 'are/O senior/O vice/O presidents/O at/O' [LOC: 'Moseley/LOCATION'] \n\n[PER: 'Susan/PERSON Davis/PERSON Wiltshire/PERSON'] ',/O is/O a/O senior/O consultant/O at/O' [ORG: 'Research/ORGANIZATION and/ORGANIZATION Planning/ORGANIZATION Inc./ORGANIZATION'] " | |
}, | |
{ | |
"metadata": {}, | |
"cell_type": "code", | |
"input": "wedding_loc_1990 = wedding_location_finder(custom_tagged_1990,AT, mprint=False)", | |
"prompt_number": 394, | |
"outputs": [], | |
"language": "python", | |
"trusted": false, | |
"collapsed": false | |
}, | |
{ | |
"metadata": {}, | |
"cell_type": "code", | |
"input": "wedding_loc_2000 = wedding_location_finder(custom_tagged_2000,AT, mprint=False)", | |
"prompt_number": 395, | |
"outputs": [], | |
"language": "python", | |
"trusted": false, | |
"collapsed": false | |
}, | |
{ | |
"metadata": {}, | |
"cell_type": "code", | |
"input": "wedding_loc_2010 = wedding_location_finder(custom_tagged_2010,AT, mprint=False)", | |
"prompt_number": 396, | |
"outputs": [], | |
"language": "python", | |
"trusted": false, | |
"collapsed": false | |
}, | |
{ | |
"metadata": {}, | |
"cell_type": "code", | |
"input": "wedding_loc_2014 = wedding_location_finder(custom_tagged_2014,AT, mprint=False)", | |
"prompt_number": 393, | |
"outputs": [], | |
"language": "python", | |
"trusted": false, | |
"collapsed": false | |
}, | |
{ | |
"metadata": {}, | |
"cell_type": "code", | |
"input": "from compiler.ast import flatten\nwedding_loc_1984_flat = flatten(wedding_loc_1984)\nwedding_loc_1990_flat = flatten(wedding_loc_1990)\nwedding_loc_2000_flat = flatten(wedding_loc_2000)\nwedding_loc_2010_flat = flatten(wedding_loc_2010)\nwedding_loc_2014_flat = flatten(wedding_loc_2014)", | |
"prompt_number": 397, | |
"outputs": [], | |
"language": "python", | |
"trusted": false, | |
"collapsed": false | |
}, | |
{ | |
"metadata": {}, | |
"cell_type": "code", | |
"input": "def extract_wedding_location (wedding_locs):\n final = []\n for i in wedding_locs:\n rels_pattern = r\"\\[PER: (.*)\\] (.*) \\[ORG: (.*)\\]\"\n org_pattern = re.compile(r\"(\\w*\\.?)\\/ORGANIZATION\")\n rels_search = re.search(rels_pattern, i)\n \n if rels_search != None:\n org_name = org_pattern.findall(rels_search.group(3))\n else:\n org_name = \"\"\n \n final.append(\" \".join(org_name))\n return final\n", | |
"prompt_number": 387, | |
"outputs": [], | |
"language": "python", | |
"trusted": false, | |
"collapsed": false | |
}, | |
{ | |
"metadata": {}, | |
"cell_type": "code", | |
"input": "pre1_wedding_loc_1984 = extract_wedding_location(wedding_loc_1984_flat)\npre2_wedding_loc_1984 = [nltk.word_tokenize(i) for i in pre1_wedding_loc_1984]\nwedding_loc_1984_final = flatten(pre2_wedding_loc_1984)\n\nwedding_loc_1984_fd = nltk.FreqDist(wedding_loc_1984_final)\nfor i in wedding_loc_1984_fd.items()[:10]:\n print i , '\\n'", | |
"prompt_number": 417, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": "('Church', 364) \n\n('St.', 238) \n\n('Episcopal', 159) \n\n('Catholic', 99) \n\n('Roman', 97) \n\n('of', 90) \n\n('s', 68) \n\n('Temple', 48) \n\n('Christ', 39) \n\n('Club', 38) \n\n" | |
} | |
], | |
"language": "python", | |
"trusted": false, | |
"collapsed": false | |
}, | |
{ | |
"metadata": {}, | |
"cell_type": "code", | |
"input": "pre1_wedding_loc_1990 = extract_wedding_location(wedding_loc_1990_flat)\npre2_wedding_loc_1990 = [nltk.word_tokenize(i) for i in pre1_wedding_loc_1990]\nwedding_loc_1990_final = flatten(pre2_wedding_loc_1990)\n\nwedding_loc_1990_fd = nltk.FreqDist(wedding_loc_1990_final)\nfor i in wedding_loc_1990_fd.items()[:10]:\n print i , '\\n'", | |
"prompt_number": 398, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": "('Church', 454) \n\n('St.', 287) \n\n('Catholic', 151) \n\n('Roman', 151) \n\n('Episcopal', 147) \n\n('of', 109) \n\n('s', 96) \n\n('Club', 93) \n\n('Temple', 69) \n\n('John', 43) \n\n" | |
} | |
], | |
"language": "python", | |
"trusted": false, | |
"collapsed": false | |
}, | |
{ | |
"metadata": {}, | |
"cell_type": "code", | |
"input": "pre1_wedding_loc_2000 = extract_wedding_location(wedding_loc_2000_flat)\npre2_wedding_loc_2000 = [nltk.word_tokenize(i) for i in pre1_wedding_loc_2000]\nwedding_loc_2000_final = flatten(pre2_wedding_loc_2000)\n\nwedding_loc_2000_fd = nltk.FreqDist(wedding_loc_2000_final)\nfor i in wedding_loc_2000_fd.items()[:10]:\n print i , '\\n'", | |
"prompt_number": 411, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": "('Church', 297) \n\n('St.', 177) \n\n('Catholic', 96) \n\n('Roman', 95) \n\n('of', 84) \n\n('Episcopal', 78) \n\n('Club', 61) \n\n('s', 58) \n\n('John', 23) \n\n('Congregational', 21) \n\n" | |
} | |
], | |
"language": "python", | |
"trusted": false, | |
"collapsed": false | |
}, | |
{ | |
"metadata": {}, | |
"cell_type": "code", | |
"input": "pre1_wedding_loc_2010 = extract_wedding_location(wedding_loc_2010_flat)\npre2_wedding_loc_2010 = [nltk.word_tokenize(i) for i in pre1_wedding_loc_2010]\nwedding_loc_2010_final = flatten(pre2_wedding_loc_2010)\n\nwedding_loc_2010_fd = nltk.FreqDist(wedding_loc_2010_final)\nfor i in wedding_loc_2010_fd.items()[:10]:\n print i , '\\n'", | |
"prompt_number": 412, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": "('Church', 73) \n\n('St.', 40) \n\n('Club', 38) \n\n('of', 24) \n\n('Catholic', 22) \n\n('House', 15) \n\n('Country', 14) \n\n('Roman', 13) \n\n('s', 11) \n\n('Hotel', 10) \n\n" | |
} | |
], | |
"language": "python", | |
"trusted": false, | |
"collapsed": false | |
}, | |
{ | |
"metadata": {}, | |
"cell_type": "code", | |
"input": "pre1_wedding_loc_2014 = extract_wedding_location(wedding_loc_2014_flat)\npre2_wedding_loc_2014 = [nltk.word_tokenize(i) for i in pre1_wedding_loc_2014]\nwedding_loc_2014_final = flatten(pre2_wedding_loc_2014)\n\nwedding_loc_2014_fd = nltk.FreqDist(wedding_loc_2014_final)\nfor i in wedding_loc_2014_fd.items()[:10]:\n print i , '\\n'", | |
"prompt_number": 413, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": "('Church', 151) \n\n('St.', 96) \n\n('Club', 56) \n\n('Catholic', 43) \n\n('of', 39) \n\n('House', 26) \n\n('s', 26) \n\n('Roman', 22) \n\n('Episcopal', 16) \n\n('Chapel', 15) \n\n" | |
} | |
], | |
"language": "python", | |
"trusted": false, | |
"collapsed": false | |
}, | |
{ | |
"metadata": {}, | |
"cell_type": "code", | |
"input": "# Use functions below to see how well the individual patterns do and then add to the master dictionary", | |
"outputs": [], | |
"language": "python", | |
"trusted": false, | |
"collapsed": false | |
}, | |
{ | |
"metadata": {}, | |
"cell_type": "code", | |
"input": "# Tester Function to extract the relationships for individual patterns \ndef test_extract_rels (tagged_data, alist, rel1,rel2,regex):\n for doc in tagged_data:\n tree = chunker_rules(doc)\n for rel in nltk.sem.extract_rels(rel1, rel2, tree, pattern = regex):\n #print nltk.sem.relextract.show_raw_rtuple(rel)\n alist.append(nltk.sem.relextract.show_raw_rtuple(rel)) \n return alist ", | |
"prompt_number": 543, | |
"outputs": [], | |
"language": "python", | |
"trusted": false, | |
"collapsed": false | |
}, | |
{ | |
"metadata": {}, | |
"cell_type": "code", | |
"input": "#Tester Function to append lists if needed \ndef append_rels(lists_to_append):\n master = []\n for i in lists_to_append:\n for rel in i: \n master.append(i)\n return master", | |
"prompt_number": 544, | |
"outputs": [], | |
"language": "python", | |
"trusted": false, | |
"collapsed": false | |
}, | |
{ | |
"metadata": {}, | |
"cell_type": "code", | |
"input": "#Print function \ndef print_rels(rels):\n print \"length of list: \",len(rels)\n print \"=\" * 125 , \"\\n\"\n for i in rels[:5]:\n print i ", | |
"prompt_number": 545, | |
"outputs": [], | |
"language": "python", | |
"trusted": false, | |
"collapsed": false | |
}, | |
{ | |
"metadata": {}, | |
"cell_type": "code", | |
"input": "#####Testing Individual Regex Patterns to add to master#########", | |
"prompt_number": 538, | |
"outputs": [], | |
"language": "python", | |
"trusted": false, | |
"collapsed": false | |
}, | |
{ | |
"metadata": {}, | |
"cell_type": "code", | |
"input": "#Task1: Genders of who is being married\n#The bride is being extracted - related regex\nbride = []\nDAUGHTER = re.compile(r'.*\\bdaughter\\b')\nbride = test_extract_rels(custom_tagged_1984, bride, 'PERSON','PERSON',DAUGHTER)\nprint_rels(bride)", | |
"outputs": [], | |
"language": "python", | |
"trusted": false, | |
"collapsed": false | |
}, | |
{ | |
"metadata": {}, | |
"cell_type": "code", | |
"input": "#OPEN QUESTION: Am I supposed to check which ones got picked and then append it to the bride list???\n#Task 1 - continued , extracting the bride \n\n#Create a list \nmarries = []\n\n#Define Regex \nMARRIES = re.compile(r'.*\\b[Mm]arrie[sd]\\b')\nWED = re.compile(r'.*\\b[Ww]eds?\\b')\nENAGEGEMENT = re.compile(r'.*\\b([Ee]ngaged|[Ee]ngagement|[Ee]gagment\\b)')\n\n#Run Relationship Extraction Function \nmarries1 = test_extract_rels(custom_tagged_1984, marries, 'PERSON','ORGANIZATION',MARRIES)\nmarries2 = test_extract_rels(custom_tagged_1984, marries, 'PERSON','PERSON',MARRIES)\nmarries3 = test_extract_rels(custom_tagged_1984, marries, 'PERSON','PERSON',WED)\nmarries4 = test_extract_rels(custom_tagged_1984, marries, 'PERSON','PERSON',ENAGEGEMENT)\n\n# Append Lists above \nlists = [marries1,marries2,marries3,marries4]\nmaster_marries = append_rels(lists)\nprint len(master_marries)\nprint '=' *100\n#Print Lists\nfor i in master_marries[:5]:\n for j in i: \n print j , \"\\n\"", | |
"outputs": [], | |
"language": "python", | |
"trusted": false, | |
"collapsed": false | |
}, | |
{ | |
"metadata": {}, | |
"cell_type": "code", | |
"input": "#Task1: Genders of who is being married\n#The groom is being extracted - related regex\ngroom = []\nSON = re.compile(r'.*\\bson\\b')\ngroom = test_extract_rels(custom_tagged_1984, groom, 'PERSON','PERSON',SON)\nprint_rels(groom)", | |
"outputs": [], | |
"language": "python", | |
"trusted": false, | |
"collapsed": false | |
}, | |
{ | |
"metadata": {}, | |
"cell_type": "code", | |
"input": "#Task 2 - Hometowns of whom is being married \nhometown = []\nOF = re.compile(r'.*\\bof\\b')\nhometown = test_extract_rels(custom_tagged_1984, hometown,'PERSON','LOCATION',OF)\nprint_rels(hometown)\n# strip out the false positives ", | |
"outputs": [], | |
"language": "python", | |
"trusted": false, | |
"collapsed": false | |
}, | |
{ | |
"metadata": {}, | |
"cell_type": "code", | |
"input": "profession = []\nPROF = re.compile(r'.*\\b([Pp]rofessor|[Bb]anker|[Pp]rogrammer|[Aa]nalyst|[Aa]ssociate|[Hh]ead[master?]|[Cc]onsultant|[Cc]hairman|[Dd]octorate|[Aa]ccountant|[Ff]reelance|[Pp]artner|[Mm]anager|[Tt]eacher|[Ll]awyer|[Pp]resident|[Dd]ean|[Ee]ngineer|[Aa]ssistant|[Dd]irector|[Ee]xecutive)\\b')\nprofession = test_extract_rels(custom_tagged_1984, profession,'PERSON','ORGANIZATION',PROF)\nprint_rels(profession)", | |
"outputs": [], | |
"language": "python", | |
"trusted": false, | |
"collapsed": false | |
}, | |
{ | |
"metadata": {}, | |
"cell_type": "code", | |
"input": "#larger funnel\nprofession_v2 = []\nIS = re.compile(r'.*\\bis\\b')\nprofession_v2 = test_extract_rels(custom_tagged_1984, profession_v2,'PERSON','ORGANIZATION',IS)\nprint_rels(profession_v2)", | |
"outputs": [], | |
"language": "python", | |
"trusted": false, | |
"collapsed": false | |
}, | |
{ | |
"metadata": {}, | |
"cell_type": "code", | |
"input": "education = []\nEDU = re.compile(r'.*\\b([Dd]egree|[M.B.A.]|[M.S.]|[M.D.]|[Dd]esigner|[Mm]aster\\'s|[Gg]raduated?|[Ee]nrolled|complet[ing|ed|e])\\b')\neducation = test_extract_rels(custom_tagged_1984, education,'PERSON','ORGANIZATION',EDU)\nprint_rels(education)", | |
"outputs": [], | |
"language": "python", | |
"trusted": false, | |
"collapsed": false | |
}, | |
{ | |
"metadata": {}, | |
"cell_type": "code", | |
"input": "#larger funnel\neducation_v2 = []\nFROM = re.compile(r'.*\\bfrom\\b')\neducation_v2 = test_extract_rels(custom_tagged_1984, education_v2,'PERSON','ORGANIZATION',FROM)\nprint_rels(education_v2)", | |
"outputs": [], | |
"language": "python", | |
"trusted": false, | |
"collapsed": false | |
}, | |
{ | |
"metadata": {}, | |
"cell_type": "code", | |
"input": "#Marries Extractors", | |
"outputs": [], | |
"language": "python", | |
"trusted": false, | |
"collapsed": false | |
}, | |
{ | |
"metadata": {}, | |
"cell_type": "code", | |
"input": "marries = []\nMARRIES = re.compile(r'.*\\b[Mm]arrie[sd]\\b')\n\nfor rel in nltk.sem.extract_rels('PERSON','ORGANIZATION', tree, pattern = MARRIES):\n marries.append(nltk.sem.relextract.show_raw_rtuple(rel))", | |
"outputs": [], | |
"language": "python", | |
"trusted": false, | |
"collapsed": false | |
}, | |
{ | |
"metadata": {}, | |
"cell_type": "code", | |
"input": "MARRIES = re.compile(r'.*\\b[Mm]arrie[sd]\\b')\n\nfor rel in nltk.sem.extract_rels('PERSON','PERSON', tree, pattern = MARRIES):\n# print nltk.sem.relextract.show_raw_rtuple(rel)\n marries.append(nltk.sem.relextract.show_raw_rtuple(rel))", | |
"outputs": [], | |
"language": "python", | |
"trusted": false, | |
"collapsed": false | |
}, | |
{ | |
"metadata": {}, | |
"cell_type": "code", | |
"input": "WED = re.compile(r'.*\\b[Ww]eds?\\b')\n\nfor rel in nltk.sem.extract_rels('PERSON','PERSON', tree, pattern = WED):\n# print nltk.sem.relextract.show_raw_rtuple(rel)\n marries.append(nltk.sem.relextract.show_raw_rtuple(rel))", | |
"outputs": [], | |
"language": "python", | |
"trusted": false, | |
"collapsed": false | |
}, | |
{ | |
"metadata": {}, | |
"cell_type": "code", | |
"input": "ENAGEGEMNT = re.compile(r'.*\\b([Ee]ngaged|[Ee]ngagement|[Ee]gagment\\b)')\n\nfor rel in nltk.sem.extract_rels('PERSON','PERSON', tree, pattern = MARRIES):\n marries.append(nltk.sem.relextract.show_raw_rtuple(rel))", | |
"outputs": [], | |
"language": "python", | |
"trusted": false, | |
"collapsed": false | |
}, | |
{ | |
"metadata": {}, | |
"cell_type": "code", | |
"input": "AT = re.compile(r'.*\\b[Aa]t\\b')\n\nfor rel in nltk.sem.extract_rels('PERSON','LOCATION', tree, pattern = AT):\n marries.append(nltk.sem.relextract.show_raw_rtuple(rel))", | |
"outputs": [], | |
"language": "python", | |
"trusted": false, | |
"collapsed": false | |
}, | |
{ | |
"metadata": {}, | |
"cell_type": "code", | |
"input": "#Putting it all together:", | |
"outputs": [], | |
"language": "python", | |
"trusted": false, | |
"collapsed": false | |
}, | |
{ | |
"metadata": {}, | |
"cell_type": "code", | |
"input": "#Set variable to collect info\nmarries = []\n\nfor doc in tagged_1984:\n #Parse every document \n tree = chunker_rules(doc)\n #Relationship Extractors - #1\n MARRIES = re.compile(r'.*\\b[Mm]arrie[sd]\\b')\n for rel in nltk.sem.extract_rels('PERSON','PERSON', tree, pattern = MARRIES):\n marries.append(nltk.sem.relextract.show_raw_rtuple(rel))\n #Relationship Extractors - #2\n MARRIESv2 = re.compile(r'.*\\b[Mm]arrie[sd]\\b')\n for rel in nltk.sem.extract_rels('PERSON','ORGANIZATION', tree, pattern = MARRIESv2):\n marries.append(nltk.sem.relextract.show_raw_rtuple(rel))\n #Relationship Extractors - #3\n WED = re.compile(r'.*\\b[Ww]eds?\\b')\n for rel in nltk.sem.extract_rels('PERSON','PERSON', tree, pattern = WED):\n marries.append(nltk.sem.relextract.show_raw_rtuple(rel))\n #Relationship Extractors - #4\n ENGAGEMENT = re.compile(r'.*\\b([Ee]ngaged|[Ee]ngagement|[Ee]gagment\\b)')\n for rel in nltk.sem.extract_rels('PERSON','PERSON', tree, pattern = ENGAGEMENT):\n marries.append(nltk.sem.relextract.show_raw_rtuple(rel))\n ", | |
"prompt_number": 577, | |
"outputs": [], | |
"language": "python", | |
"trusted": false, | |
"collapsed": false | |
}, | |
{ | |
"metadata": {}, | |
"cell_type": "code", | |
"input": "print len(marries)\n\nfor i in marries[:5]:\n print i + '\\n'", | |
"outputs": [], | |
"language": "python", | |
"trusted": false, | |
"collapsed": false | |
}, | |
{ | |
"metadata": {}, | |
"cell_type": "code", | |
"input": "#Alternate preprocessing for non-regex extraction\ncorpus_dict2 = {} # Structure is set up to be Year > Month > Wedding Announcements \ntracker2 = {} #to track wedding announcements that enter the dictionary. Some announcements are social events not weddings.\n\nfor fileid in wordlists.fileids():\n # Split up each wedding announcement in the file by the pattern below - '2 of 600 DOCUMENTS'\n doc_list = re.split('((?m)^\\\\s+[0-9]+\\\\s*of\\\\s*[0-9]+\\\\s+DOCUMENTS)', wordlists.raw(fileid)) \n\n num_docs = 0\n for idx in range (2,len(doc_list),2):\n num_docs += 1\n paragraphs = doc_list[idx].split('\\r\\n\\r\\n')\n sents = []\n for i in range(len(paragraphs)): \n paragraphs[i] = re.sub('\\r\\n', ' ', paragraphs[i].strip())\n sents += sent_detector.tokenize(paragraphs[i])\n\n whole_article_string = \" \".join(sents) \n\n #remove article if it has byline because it would not be a wedding announcement\n if \"BYLINE:\" in whole_article_string:\n continue \n #remove article if it is about Events, not weddings\n if \"future events\" in whole_article_string.lower():\n continue \n\n #find date of article \n date = re.search(date_pattern,whole_article_string)\n m = re.search(mp,date.group(0))\n month = m.group(0)\n y = re.search(yp,date.group(0))\n year = y.group(0)\n\n #remove junk lines and add article to dictionary\n good_lines = []\n for sent in sents:\n if len(sent) != 0: \n #remove junk lines \n unwanted_pattern = r\"\\b(^(The New York Times)$|([0-9]{1})-[0-9]{2}$|[JFMASOND]\\w+ [0-9]{1,2}, ([0-9]{4})(,?) Sunday|^(Copyright) [0-9]{4} (The New York Times Company)$|^(DATELINE:.*)|^(SECTION:.*)|^(LENGTH:.*)|^(LOAD-DATE:.*)|(http:.*)|^(PUBLICATION-TYPE:.*)|^(LANGUAGE:.*)|^(GRAPHIC:.*))\\b\" \n junk_line = re.search(unwanted_pattern, sent)\n if junk_line == None:\n \n good_lines.append(re.sub(r\"WEDDINGS/CELEBRATIONS; \", \"\", sent))\n corpus_dict2.setdefault(year,{}).setdefault(month, []).append(good_lines)\n tracker2.setdefault(fileid,[]).append(doc_list[idx-1])\n\n# print corpus_dict['1984']['March']", | |
"outputs": [], | |
"language": "python", | |
"trusted": false, | |
"collapsed": false | |
}, | |
{ | |
"metadata": {}, | |
"cell_type": "code", | |
"input": "same_sex_marriages_counts = {} \nsame_sex_marriages_text = {} \nfor year in (2002,2003,2005,2010,2014):\n same_sex_marriages_counts[str(year)] = {'same_sex_announcement': 0, 'non_same_sex_announcement': 0} \n same_sex_marriages_text[str(year)] = {'same_sex_announcement': [], 'non_same_sex_announcement': []} \n for month in corpus_dict2[str(year)].keys():\n for article in corpus_dict2[str(year)][month]:\n article_str = \" \".join(article)\n if 'bride' not in article_str and 'groom' not in article_str:\n if ('daughter' not in article_str and 'son' in article_str) or ('son' not in article_str and 'daughter' in article_str):\n same_sex_marriages_counts[str(year)]['same_sex_announcement'] += 1\n same_sex_marriages_text[str(year)]['same_sex_announcement'].append(article_str)\n elif ('Mr.' not in article_str and 'Ms.' in article_str) or ('Mr.' in article_str and ('Ms.' not in article_str and 'Mrs.' not in article_str)):\n same_sex_marriages_counts[str(year)]['same_sex_announcement'] += 1\n same_sex_marriages_text[str(year)]['same_sex_announcement'].append(article_str) \n else:\n same_sex_marriages_counts[str(year)]['non_same_sex_announcement'] += 1\n same_sex_marriages_text[str(year)]['non_same_sex_announcement'].append(article_str)\n", | |
"outputs": [], | |
"language": "python", | |
"trusted": false, | |
"collapsed": false | |
}, | |
{ | |
"metadata": {}, | |
"cell_type": "code", | |
"input": "widow_counts = {} \nwidow_text = {} \nfor year in (1984,1985,1986,1990,1991,1995,2000,2001,2002,2003,2005,2010,2014):\n widow_counts[str(year)] = {'yes': 0, 'no': 0} \n widow_text[str(year)] = {'yes': [], 'no': []} \n for month in corpus_dict2[str(year)].keys():\n for article in corpus_dict2[str(year)][month]:\n article_str = \" \".join(article)\n if 'widow' not in article_str:\n# if ('daughter' not in article_str and 'son' in article_str) or ('son' not in article_str and 'daughter' in article_str):\n widow_counts[str(year)]['no'] += 1\n widow_text[str(year)]['no'].append(article_str)\n# elif ('Mr.' not in article_str and 'Ms.' in article_str) or ('Mr.' in article_str and ('Ms.' not in article_str and 'Mrs.' not in article_str)):\n# same_sex_marriages_counts[str(year)]['same_sex_announcement'] += 1\n# same_sex_marriages_text[str(year)]['same_sex_announcement'].append(article_str) \n else:\n widow_counts[str(year)]['yes'] += 1\n widow_text[str(year)]['yes'].append(article_str)\n", | |
"outputs": [], | |
"language": "python", | |
"trusted": false, | |
"collapsed": false | |
}, | |
{ | |
"metadata": {}, | |
"cell_type": "code", | |
"input": "divorce_counts = {} \ndivorce_text = {} \nfor year in (1984,1985,1986,1990,1991,1995,2000,2001,2002,2003,2005,2010,2014):\n divorce_counts[str(year)] = {'yes': 0, 'no': 0} \n divorce_text[str(year)] = {'yes': [], 'no': []} \n for month in corpus_dict2[str(year)].keys():\n for article in corpus_dict2[str(year)][month]:\n article_str = \" \".join(article)\n if 'divorce' not in article_str:\n# if ('daughter' not in article_str and 'son' in article_str) or ('son' not in article_str and 'daughter' in article_str):\n divorce_counts[str(year)]['no'] += 1\n divorce_text[str(year)]['no'].append(article_str)\n# elif ('Mr.' not in article_str and 'Ms.' in article_str) or ('Mr.' in article_str and ('Ms.' not in article_str and 'Mrs.' not in article_str)):\n# same_sex_marriages_counts[str(year)]['same_sex_announcement'] += 1\n# same_sex_marriages_text[str(year)]['same_sex_announcement'].append(article_str) \n else:\n divorce_counts[str(year)]['yes'] += 1\n divorce_text[str(year)]['yes'].append(article_str)\n", | |
"outputs": [], | |
"language": "python", | |
"trusted": false, | |
"collapsed": false | |
}, | |
{ | |
"metadata": {}, | |
"cell_type": "code", | |
"input": "keephername_counts = {} \nkeephername_text = {} \nkhn_pattern = r'keep\\w* her name'\nprof_pattern = r'continu\\w* to use her name professionally'\nfor year in (1984,1985,1986,1990,1991,1995,2000,2001,2002,2003,2005,2010,2014):\n keephername_counts[str(year)] = {'yes': 0, 'no': 0} \n keephername_text[str(year)] = {'yes': [], 'no': []} \n for month in corpus_dict2[str(year)].keys():\n for article in corpus_dict2[str(year)][month]:\n article_str = \" \".join(article)\n match1 = re.search(khn_pattern, article_str)\n match2 = re.search(prof_pattern, article_str)\n if match1 != None or match2 != None:\n# if ('daughter' not in article_str and 'son' in article_str) or ('son' not in article_str and 'daughter' in article_str):\n keephername_counts[str(year)]['yes'] += 1\n keephername_text[str(year)]['yes'].append(article_str)\n# elif ('Mr.' not in article_str and 'Ms.' in article_str) or ('Mr.' in article_str and ('Ms.' not in article_str and 'Mrs.' not in article_str)):\n# same_sex_marriages_counts[str(year)]['same_sex_announcement'] += 1\n# same_sex_marriages_text[str(year)]['same_sex_announcement'].append(article_str) \n else:\n keephername_counts[str(year)]['no'] += 1\n keephername_text[str(year)]['no'].append(article_str)\n", | |
"outputs": [], | |
"language": "python", | |
"trusted": false, | |
"collapsed": false | |
} | |
], | |
"metadata": {} | |
} | |
], | |
"metadata": { | |
"name": "", | |
"signature": "sha256:ca007ec134faa279beea9fc48bc2c8bcf287761b77c467f1c555856f119dc22f" | |
}, | |
"nbformat": 3 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment