Skip to content

Instantly share code, notes, and snippets.

@fayeip
Created December 19, 2014 00:53
Show Gist options
  • Save fayeip/3440ce9537e568d8f743 to your computer and use it in GitHub Desktop.
Save fayeip/3440ce9537e568d8f743 to your computer and use it in GitHub Desktop.
Display the source blob
Display the rendered blob
Raw
{
"worksheets": [
{
"cells": [
{
"metadata": {},
"cell_type": "code",
"input": "import nltk\nfrom nltk.corpus import PlaintextCorpusReader\nimport re\nfrom itertools import chain\nfrom nltk import tokenize\nfrom nltk.corpus import stopwords\nimport nltk.data\nimport json\nimport pdb\nfrom collections import defaultdict",
"prompt_number": 1,
"outputs": [],
"language": "python",
"trusted": true,
"collapsed": false
},
{
"metadata": {},
"cell_type": "code",
"input": "#Importing corpus\ncorpus_root = 'data'\nwordlists = PlaintextCorpusReader(corpus_root, '.*\\\\.txt')\nsent_detector = nltk.data.load('tokenizers/punkt/english.pickle')",
"prompt_number": 2,
"outputs": [],
"language": "python",
"trusted": true,
"collapsed": false
},
{
"metadata": {},
"cell_type": "heading",
"source": "Part 1: Preprocessing and cleaning data",
"level": 3
},
{
"metadata": {},
"cell_type": "code",
"input": "#Create date regex parameters \ndate_pattern = '((J(anuary|u(ne|ly))|February|Ma(rch|y)|A(pril|ugust)|(((Sept|Nov|Dec)em)|Octo)ber).*([0-9]))'\nmp = '(J(anuary|u(ne|ly))|February|Ma(rch|y)|A(pril|ugust)|(((Sept|Nov|Dec)em)|Octo)ber)'\nyp = '[0-9]{4}'",
"prompt_number": 3,
"outputs": [],
"language": "python",
"trusted": true,
"collapsed": false
},
{
"metadata": {},
"cell_type": "code",
"input": "# Testing the patterns \ntest = \"This is the month of November 9, 2014\"\ndate = re.search(date_pattern,test)\nm = re.search(mp,date.group(0))\nmonth = m.group(0)\ny = re.search(yp,date.group(0))\nyear = y.group(0)",
"prompt_number": 4,
"outputs": [],
"language": "python",
"trusted": true,
"collapsed": false
},
{
"metadata": {},
"cell_type": "code",
"input": "#Set up the dictionaries \ncorpus_dict = {}\n\n#Putting it all together\nfor fileid in wordlists.fileids():\n #Part 1: split of xx of DOCUMENTS \n doc_list = re.split('((?m)^\\\\s+[0-9]+\\\\s*of\\\\s*[0-9]+\\\\s+DOCUMENTS)', wordlists.raw(fileid))\n doc_list.pop(0) #got rid of garbage first empty line\n master_list = list() # put all documents by id, header, footer\n #print len(doc_list) # keep for testing -- how many documents within a single file \n \n #Part 2: split into id, head and footer and create a triple tuple \n for idx in range(0, len(doc_list), 2):\n # add a new tuple of id, header, footer\n # split condition in order of importance\n split_conds = ['words\\r\\n\\r\\n', 'Edition\\r\\n\\r\\n', 'Society Desk\\r\\n\\r\\n','Society Desk\\r\\n\\r\\n\\r\\n','DATELINE: Camden, Me.,\\r\\n\\r\\n\\r\\n']\n doc_split = []\n for cond in split_conds:\n doc_split = re.split(cond,doc_list[idx+1], 1)\n if len(doc_split) == 2:\n break\n #Part 2 contd: Error check to see if any of the splits didn't go through \n if len(doc_split) < 2:\n doc_parts = (doc_list[idx], doc_split)\n print \"too few traces\"\n pdb.set_trace()\n elif len(doc_split) > 2:\n print \"too many splits\"\n else:\n doc_parts = (doc_list[idx], doc_split[0], doc_split[1])\n# print doc_split[0]\n# print '<><><><><><><><><>'\n# print doc_split[1]\n# print \"****************************************\"\n master_list.append(doc_parts) #Create that tuple triple \n \n year_counter = []\n #Part 3: Read the header and extract date \n for doc in master_list:\n #Part 3 a: Header cleaning steps \n clean_header = re.sub(r\"\\b(The New York Times|(DATELINE:.*)|(BYLINE.*)|(.*Correction Appended.*)|(SECTION:.*)|(LENGTH:.*)|(LOAD-DATE:.*)|(http:.*)|(LANGUAGE:.*)|(GRAPHIC:.*)|(Copyright.*)|(Late Edition - Final.*))\\b\", \"\", doc[1])\n clean_header = clean_header.replace(\"\\r\",\"\").strip()\n clean_header = [x for x in clean_header.split('\\n') if any(x.isalnum() for x in x)]\n header_final = ' '.join(clean_header)\n\n #Part 3b: Extracting the date\n date = re.search(date_pattern,header_final)\n m = re.search(mp,date.group(0))\n month = m.group(0)\n y = re.search(yp,date.group(0))\n year = y.group(0)\n year_counter.append(year) \n\n if \"Events\" not in header_final:\n body = doc[2]\n clean_sent = re.sub(r\"\\b(The New York Times|(DATELINE:.*)|(SECTION:.*)|(LENGTH:.*)|(LOAD-DATE:.*)|(http:.*)|(URL:.*)|(LANGUAGE:.*)|(PUBLICATION.*)|(GRAPHIC:.*)|(Copyright.*))\\b\", \"\", body)\n body = re.sub('\\r\\n(?!\\r\\n)', ' ',clean_sent)\n\n #Part 4 adding to the dictionary\n corpus_dict.setdefault(year,{}).setdefault(month, []).append((doc[0],header_final,body)) \n \n#Part 5: Write to a JSON file \nwith open('data/dict2014.json', 'wb') as fp:\n json.dump(corpus_dict, fp)\n ",
"prompt_number": 6,
"outputs": [],
"language": "python",
"trusted": true,
"collapsed": false
},
{
"metadata": {},
"cell_type": "heading",
"source": "Part 2: NER Tagging and Chunking",
"level": 3
},
{
"metadata": {},
"cell_type": "code",
"input": "#Download Stanford NER taggers\nfrom nltk.tag.stanford import POSTagger\nfrom nltk.tag.stanford import NERTagger\npost = POSTagger('lib/stanford-postagger-2014-10-26/models/english-bidirectional-distsim.tagger',\n 'lib/stanford-postagger-2014-10-26/stanford-postagger.jar', 'utf-8')\n\nnert = NERTagger('lib/stanford-ner-2014-10-26/classifiers/english.all.3class.distsim.crf.ser.gz',\n 'lib/stanford-ner-2014-10-26/stanford-ner.jar', 'utf-8')",
"prompt_number": 303,
"outputs": [],
"language": "python",
"trusted": false,
"collapsed": false
},
{
"metadata": {},
"cell_type": "code",
"input": "#Load the entity tagged file as a tuple of tuples \nfrom ast import literal_eval\n\ntagged_1984 = []\n\nwith open('data_tagged/1984_tagged.txt', 'r') as f:\n for line in f:\n line.split(',')\n tagged_1984.append(literal_eval(line.strip()))\n",
"prompt_number": 9,
"outputs": [],
"language": "python",
"trusted": true,
"collapsed": false
},
{
"metadata": {},
"cell_type": "code",
"input": "# Import the RegexpParser\nfrom nltk.chunk import RegexpParser\n\n# Define custom tagged entities - group NE's together \n\ndef chunker_rules(values):\n # Define custom grammar (modified to be a valid regex).\n grammar = r'''\n PERSON:\n {<PERSON><O><PERSON>+}\n {<PERSON>+}\n ORGANIZATION: \n {<ORGANIZATION>+}\n LOCATION: \n {<LOCATION>+}\n WIDOW:\n {<W>}\n\n DIVORCED:\n {<D>} \n GROOM:\n {<G>}\n BRIDE:\n {<B>}\n RELIGIOUS:\n {<R><PERSON>+<O>+<LOCATION>}\n\n '''\n cp = nltk.RegexpParser(grammar) # Create an instance of your custom parser.\n return cp.parse(values) # Parse!\n\ndef entity_chunker(tagged_docs):\n chunks = []\n for doc in tagged_docs:\n tree = chunker_rules(doc)\n for subtree in tree.subtrees():\n if (subtree.node == 'WIDOW'):\n leaflist = [leaf[0] for leaf in subtree.leaves()]\n chunks.append(' '.join(leaflist))\n return chunks\n ",
"prompt_number": 7,
"outputs": [],
"language": "python",
"trusted": true,
"collapsed": false
},
{
"metadata": {},
"cell_type": "code",
"input": "# print len(entity_chunker(tagged_1984))",
"prompt_number": 11,
"outputs": [],
"language": "python",
"trusted": true,
"collapsed": false
},
{
"metadata": {},
"cell_type": "code",
"input": "#Customizing the tagger\n#assigning a custom tag in the word,tag \n\nfrom ast import literal_eval\n\ndef alter_source (sourcefile):\n f_before = open(sourcefile,'r')\n f_before_str = f_before.read()\n f_before.close()\n\n \"\"\"Customizing the tags labeled 'O'\n 1) Widow , Widower, Widowed >> label: W for widow\n 2) Mr., Mrs., Adm., Sgt., Dr. >> label: PERSON\n 3) Rev., Rabbi, priest >> label: R for religious \n 4) bride >> label: B for bride \n 5) bridegroom, groom >> label: G for groom\n\n \"\"\"\n\n f_after_str = ''\n # Adding the custom tag set 1 - widow\n f_after_str_1 = re.sub(r\"\\(\\'widow\\', \\'O\\'\\)\", \"('widow', 'W')\",f_before_str)\n f_after_str_2 = re.sub(r\"\\(\\'widower\\', \\'O\\'\\)\", \"('widower', 'W')\",f_after_str_1)\n f_after_str_3 = re.sub(r\"\\(\\'widowed\\', \\'O\\'\\)\", \"('widowed', 'W')\",f_after_str_2)\n\n #Adding the custom tag set 2 - person \n f_after_str_4 = re.sub(r\"\\(\\'Mr.\\', \\'O\\'\\)\", \"('Mr.', 'PERSON')\",f_after_str_3)\n f_after_str_5 = re.sub(r\"\\(\\'Mrs.\\', \\'O\\'\\)\", \"('Mrs.', 'PERSON')\",f_after_str_4)\n f_after_str_6 = re.sub(r\"\\(\\'Adm.\\', \\'O\\'\\)\", \"('Adm.', 'PERSON')\",f_after_str_5)\n f_after_str_7 = re.sub(r\"\\(\\'Sgt.\\', \\'O\\'\\)\", \"('Sgt.', 'PERSON')\",f_after_str_6)\n f_after_str_8 = re.sub(r\"\\(\\'Dr.\\', \\'O\\'\\)\", \"('Dr.', 'PERSON')\",f_after_str_7)\n\n\n #Adding the custom tag set 3 - religious head \n f_after_str_9 = re.sub(r\"\\(\\'Rev.\\', \\'O\\'\\)\", \"('Rev.', 'R')\",f_after_str_8)\n f_after_str_10 = re.sub(r\"\\(\\'\\bRabbi\\b\\', \\'O\\'\\)\", \"('Rabbi', 'R')\",f_after_str_9)\n f_after_str_11 = re.sub(r\"\\(\\'\\bpriest\\b\\', \\'O\\'\\)\", \"('priest','R')\",f_after_str_10)\n\n # Adding the custom tag set 4 - divorced\n f_after_str_12 = re.sub(r\"\\(\\'\\bdivorce\\b\\', \\'O\\'\\)\", \"('divorce', 'D')\",f_after_str_11)\n f_after_str_13 = re.sub(r\"\\(\\'\\bdivorced\\b\\', \\'O\\'\\)\", \"('divorced', 'D')\",f_after_str_12)\n\n # Adding the custom tag set 4 - divorced\n f_after_str_14 = re.sub(r\"\\(\\'divorce\\', \\'O\\'\\)\", \"sufia\", f_after_str_13)\n f_after_str_15 = re.sub(r\"\\(\\'divorced\\', \\'O\\'\\)\", \"('divorced', 'D')\",f_after_str_14)\n\n # Adding the custom tag set 5 - bride\n f_after_str_16 = re.sub(r\"\\(\\'\\bbride\\b\\', \\'O\\'\\)\", \"('bride', 'B')\",f_after_str_15)\n\n # Adding the custom tag set 6 - bridegroom\n f_after_str_17 = re.sub(r\"\\(\\'\\bbridegroom\\b\\', \\'O\\'\\)\", \"('bridegroom', 'G')\",f_after_str_16)\n f_after_str_final = re.sub(r\"\\(\\'\\bgroom\\b\\', \\'O\\'\\)\", \"('groom', 'G')\",f_after_str_17)\n \n return f_after_str_final\n\n\ndef apply_custom_tags (targetfile, custom_tags):\n f = open(targetfile,'w')\n f.write(custom_tags)\n f.close()\n\n custom_tag_list = []\n\n with open(targetfile, 'r') as g:\n for line in g:\n line.split('\\n')\n custom_tag_list.append(literal_eval(line.strip()))\n return custom_tag_list",
"prompt_number": 32,
"outputs": [],
"language": "python",
"trusted": false,
"collapsed": false
},
{
"metadata": {},
"cell_type": "code",
"input": "#Run custom tagger for all 5 years\n\ncustom_tagged_1984 = apply_custom_tags('data_tagged/1984_tagged_custom.txt',alter_source('data_tagged/1984_tagged.txt'))\ncustom_tagged_1990 = apply_custom_tags('data_tagged/1990_tagged_custom.txt',alter_source('data_tagged/1990_tagged.txt'))\ncustom_tagged_2000 = apply_custom_tags('data_tagged/2000_tagged_custom.txt',alter_source('data_tagged/2000_tagged.txt'))\ncustom_tagged_2010 = apply_custom_tags('data_tagged/2010_tagged_custom.txt',alter_source('data_tagged/2010_tagged.txt'))\ncustom_tagged_2014 = apply_custom_tags('data_tagged/2014_tagged_custom.txt',alter_source('data_tagged/2014_tagged.txt'))\n\n",
"prompt_number": 186,
"outputs": [],
"language": "python",
"trusted": false,
"collapsed": false
},
{
"metadata": {},
"cell_type": "heading",
"source": "Part 3: Creating dictionaries to store extracted data",
"level": 3
},
{
"metadata": {},
"cell_type": "code",
"input": "#Staging the dictionaries\nrel_dict_1984 = defaultdict(dict)\nrel_dict_1990 = defaultdict(dict)\nrel_dict_2000 = defaultdict(dict)\nrel_dict_2010 = defaultdict(dict)\nrel_dict_2014 = defaultdict(dict)",
"prompt_number": 421,
"outputs": [],
"language": "python",
"trusted": false,
"collapsed": false
},
{
"metadata": {},
"cell_type": "code",
"input": "def lift_names_remove_tags(relation, extracted_relation_str):\n \n if relation == 'DAUGHTER' or relation == 'SON':\n rels_pattern = r\"\\[PER: (.*)\\] (.*) \\[PER: (.*)\\]\"\n name_pattern = re.compile(r\"(\\w*\\.?)\\/PERSON\")\n names_search = re.search(rels_pattern, extracted_relation_str)\n bride_or_groom_name = name_pattern.findall(names_search.group(1))\n parents_name = name_pattern.findall(names_search.group(3))\n bride_or_groom_name_str = ''\n parents_name_str = ''\n for bg in bride_or_groom_name:\n bride_or_groom_name_str += bg + \" \"\n for p in parents_name:\n parents_name_str += p + \" \"\n\n return bride_or_groom_name_str.strip(), parents_name_str.strip()",
"prompt_number": 173,
"outputs": [],
"language": "python",
"trusted": false,
"collapsed": false
},
{
"metadata": {},
"cell_type": "code",
"input": "def extract_rels(rel_dict, dict_key, relKey, rel1, rel2, tree,regex): \n \"\"\"This function extracts the relationships\n Function Outputs:\n final output = {dict_key: [{relKey:[relationship extracted]}, {relKey:[relationship extracted]}]}\n example = {1:[{bride:['Mary Flyn marries John Mayer],{groom: ['John is a son of Mr and Mrs Mayer]}}]}\n \n Function Inputs:\n 1) rel_dict = This is the default dict that will contain all the patterns in a dictionary per wedding announcement\n 2) dict_key = This is basically a counter per wedding announcment \n 3) relKey = This is the second key i.e. the relationship type you want to get values for \n 4) rel1 , rel2, regex = 'PERSON' [the word \"marries\"] 'PERSON'\n 5) tree = the parsed tree\n \"\"\" \n for rel in nltk.sem.extract_rels(rel1, rel2, tree, pattern = regex):\n dict_values = []\n dict_values.append(nltk.sem.relextract.show_raw_rtuple(rel))\n rel_dict[str(dict_key)][relKey] = dict_values",
"prompt_number": 174,
"outputs": [],
"language": "python",
"trusted": false,
"collapsed": false
},
{
"metadata": {},
"cell_type": "code",
"input": "def make_rels_dict(tagged_data, rel_dict, relKey, rel1,rel2,regex):\n \"\"\"This function makes the dictionary for the relationships you want to extract -- \n read comments in function \"extract_rels\" for more context\"\"\"\n dict_key = 1\n \n for doc in tagged_data:\n tree = chunker_rules(doc)\n extract_rels (rel_dict, dict_key, relKey, rel1, rel2, tree,regex)\n dict_key +=1\n return rel_dict",
"prompt_number": 175,
"outputs": [],
"language": "python",
"trusted": false,
"collapsed": false
},
{
"metadata": {},
"cell_type": "code",
"input": "DAUGHTER = re.compile(r'.*\\bdaughter\\b')\n\"\"\"Create an entry in a dictionary for the bride based on the pattern called DAUGHTER\"\"\"\n\nmake_rels_dict(custom_tagged_1984,rel_dict_1984,'bride','PERSON','PERSON',DAUGHTER)\nmake_rels_dict(custom_tagged_1990,rel_dict_1990,'bride','PERSON','PERSON',DAUGHTER)\nmake_rels_dict(custom_tagged_2000,rel_dict_2000,'bride','PERSON','PERSON',DAUGHTER)\nmake_rels_dict(custom_tagged_2010,rel_dict_2010,'bride','PERSON','PERSON',DAUGHTER)\nmake_rels_dict(custom_tagged_2014,rel_dict_2014,'bride','PERSON','PERSON',DAUGHTER)\nprint \"Bride key has been added to master dict\"",
"prompt_number": 422,
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": "Bride key has been added to master dict\n"
}
],
"language": "python",
"trusted": false,
"collapsed": false
},
{
"metadata": {},
"cell_type": "code",
"input": "test = rel_dict['347']['bride']\nprint test",
"prompt_number": 128,
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": "[\"[PER: 'Leicia/PERSON Sharon/PERSON Osborne/PERSON'] ',/O the/O daughter/O of/O' [PER: 'Mr./PERSON and/O Mrs./PERSON Philip/PERSON Barry/PERSON Osborne/PERSON']\"]\n"
}
],
"language": "python",
"trusted": false,
"collapsed": false
},
{
"metadata": {},
"cell_type": "code",
"input": "lift_names_remove_tags('DAUGHTER', test[0])",
"prompt_number": 129,
"outputs": [
{
"output_type": "pyout",
"prompt_number": 129,
"metadata": {},
"text": "('Leicia Sharon Osborne', 'Mr. Mrs. Philip Barry Osborne')"
}
],
"language": "python",
"trusted": false,
"collapsed": false
},
{
"metadata": {},
"cell_type": "code",
"input": "SON = re.compile(r'.*\\bson\\b')\n\"\"\"Create an entry in a dictionary for the groom based on the pattern called SON\"\"\"\n\nmake_rels_dict(custom_tagged_1984,rel_dict_1984,'groom','PERSON','PERSON',SON) \nmake_rels_dict(custom_tagged_1990,rel_dict_1990,'groom','PERSON','PERSON',SON) \nmake_rels_dict(custom_tagged_2000,rel_dict_2000,'groom','PERSON','PERSON',SON) \nmake_rels_dict(custom_tagged_2010,rel_dict_2010,'groom','PERSON','PERSON',SON) \nmake_rels_dict(custom_tagged_2014,rel_dict_2014,'groom','PERSON','PERSON',SON) \nprint \"Groom key has been added to master dict\"",
"prompt_number": 423,
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": "Groom key has been added to master dict\n"
}
],
"language": "python",
"trusted": false,
"collapsed": false
},
{
"metadata": {},
"cell_type": "code",
"input": "groom_test = rel_dict['347']['groom']\nprint groom_test",
"prompt_number": 132,
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": "[\"[PER: 'Michael/PERSON Anthony/PERSON Milano/PERSON'] ',/O a/O son/O of/O' [PER: 'Mr./PERSON and/O Mrs./PERSON John/PERSON A./PERSON Milano/PERSON']\"]\n"
}
],
"language": "python",
"trusted": false,
"collapsed": false
},
{
"metadata": {},
"cell_type": "code",
"input": "lift_names_remove_tags('SON', groom_test[0])",
"prompt_number": 133,
"outputs": [
{
"output_type": "pyout",
"prompt_number": 133,
"metadata": {},
"text": "('Michael Anthony Milano', 'Mr. Mrs. John A. Milano')"
}
],
"language": "python",
"trusted": false,
"collapsed": false
},
{
"metadata": {},
"cell_type": "code",
"input": "#Task 2 - Hometowns of whom is being married \nOF = re.compile(r'.*\\bof\\b')\n\"\"\"Create an entry in a dictionary for the marriage location based on the pattern called DAUGHTER\"\"\"\n\nmake_rels_dict(custom_tagged_1984,rel_dict_1984,'hometowns','PERSON','LOCATION',OF) \nmake_rels_dict(custom_tagged_1990,rel_dict_1990,'hometowns','PERSON','LOCATION',OF) \nmake_rels_dict(custom_tagged_2000,rel_dict_2000,'hometowns','PERSON','LOCATION',OF) \nmake_rels_dict(custom_tagged_2010,rel_dict_2010,'hometowns','PERSON','LOCATION',OF) \nmake_rels_dict(custom_tagged_2014,rel_dict_2014,'hometowns','PERSON','LOCATION',OF) \nprint \"Hometowns key has been added to master dict\"",
"prompt_number": 424,
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": "Hometowns key has been added to master dict\n"
}
],
"language": "python",
"trusted": false,
"collapsed": false
},
{
"metadata": {},
"cell_type": "code",
"input": "IS = re.compile(r'.*\\bis\\b')\n\"\"\"Create an entry in a dictionary for profession_v2 based on the pattern called IS\"\"\"\n\nmake_rels_dict(custom_tagged_1984,rel_dict_1984,'profession_v2','PERSON','ORGANIZATION',IS)\nmake_rels_dict(custom_tagged_1990,rel_dict_1990,'profession_v2','PERSON','ORGANIZATION',IS)\nmake_rels_dict(custom_tagged_2000,rel_dict_2000,'profession_v2','PERSON','ORGANIZATION',IS)\nmake_rels_dict(custom_tagged_2010,rel_dict_2010,'profession_v2','PERSON','ORGANIZATION',IS)\nmake_rels_dict(custom_tagged_2014,rel_dict_2014,'profession_v2','PERSON','ORGANIZATION',IS)\nprint \"Professions version 2 (example: Mary May is a teacher at Riverdale School) key has been added to master dict\"",
"prompt_number": 425,
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": "Professions version 2 (example: Mary May is a teacher at Riverdale School) key has been added to master dict\n"
}
],
"language": "python",
"trusted": false,
"collapsed": false
},
{
"metadata": {},
"cell_type": "code",
"input": "EDU = re.compile(r'.*\\b([Dd]egree|[M.B.A.]|[M.S.]|[M.D.]|[Dd]esigner|[Mm]aster\\'s|[Gg]raduated?|[Ee]nrolled?|complet[ing|ed|e])\\b')\n\"\"\"Create an entry in a dictionary for education based on the pattern called EDU\"\"\"\n\nmake_rels_dict(custom_tagged_1984,rel_dict_1984,'education','PERSON','ORGANIZATION',EDU)\nmake_rels_dict(custom_tagged_1990,rel_dict_1990,'education','PERSON','ORGANIZATION',EDU)\nmake_rels_dict(custom_tagged_2000,rel_dict_2000,'education','PERSON','ORGANIZATION',EDU)\nmake_rels_dict(custom_tagged_2010,rel_dict_2010,'education','PERSON','ORGANIZATION',EDU)\nmake_rels_dict(custom_tagged_2014,rel_dict_2014,'education','PERSON','ORGANIZATION',EDU)\nprint \"Education key has been added to master dict\"",
"prompt_number": 426,
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": "Education key has been added to master dict\n"
}
],
"language": "python",
"trusted": false,
"collapsed": false
},
{
"metadata": {},
"cell_type": "code",
"input": "\"\"\"Extract values from the education and profession keys from the reldicts for analysis\"\"\"\n\ndef extract_relsdict_values (rel_dict):\n range_reldict = len(rel_dict.keys())\n edu_values = []\n prof_values = []\n hometown_values = []\n \n for idx in range (0,range_reldict):\n if rel_dict[str(idx)].has_key('education'):\n edu_values.append(rel_dict[str(idx)]['education'])\n elif rel_dict[str(idx)].has_key('profession_v2'):\n prof_values.append(rel_dict[str(idx)]['profession_v2'])\n\n \n edu_values_flat = flatten(edu_values)\n prof_values_flat = flatten(prof_values)\n return edu_values_flat, prof_values_flat",
"prompt_number": 453,
"outputs": [],
"language": "python",
"trusted": false,
"collapsed": false
},
{
"metadata": {},
"cell_type": "code",
"input": "\"\"\"Extract and format values from the education key from the reldicts for analysis\"\"\"\n\ndef extract_edu (alist):\n final = []\n for i in alist:\n rels_pattern = r\"\\[PER: (.*)\\] (.*) \\[ORG: (.*)\\]\"\n org_pattern = re.compile(r\"(\\w*\\.?)\\/ORGANIZATION\")\n rels_search = re.search(rels_pattern, i)\n if rels_search != None:\n org_name = org_pattern.findall(rels_search.group(3))\n else:\n org_name = \"\"\n \n final.append(\" \".join(org_name))\n return final",
"prompt_number": 510,
"outputs": [],
"language": "python",
"trusted": false,
"collapsed": false
},
{
"metadata": {},
"cell_type": "code",
"input": "\"\"\"Extract values from the education and profession keys from the reldicts for analysis\"\"\"\n\nedu_values1984, prof_values1984 = extract_relsdict_values(rel_dict_1984)\nedu_values1990, prof_values1990 = extract_relsdict_values(rel_dict_1990)\nedu_values2000, prof_values2000 = extract_relsdict_values(rel_dict_2000)\nedu_values2010, prof_values2010 = extract_relsdict_values(rel_dict_2010)\nedu_values2014, prof_values2014 = extract_relsdict_values(rel_dict_2014)",
"prompt_number": 511,
"outputs": [],
"language": "python",
"trusted": false,
"collapsed": false
},
{
"metadata": {},
"cell_type": "code",
"input": "\"\"\"Run fd on the education key from the reldicts for analysis\"\"\"\n\nedu1984_fd = nltk.FreqDist(extract_edu(edu_values1984))\nedu1990_fd = nltk.FreqDist(extract_edu(edu_values1990))\nedu2000_fd = nltk.FreqDist(extract_edu(edu_values2000))\nedu2010_fd = nltk.FreqDist(extract_edu(edu_values2010))\nedu2014_fd = nltk.FreqDist(extract_edu(edu_values2014))\n\ncsv_str = \"\"\nfor school,count in edu1984_fd.items()[:20]:\n csv_str += \"1984,\"+str(school)+\",\"+str(count)+\"\\n\"\nfor school,count in edu1990_fd.items()[:20]:\n csv_str += \"1990,\"+str(school)+\",\"+str(count)+\"\\n\"\nfor school,count in edu2000_fd.items()[:20]:\n csv_str += \"2000,\"+str(school)+\",\"+str(count)+\"\\n\"\nfor school,count in edu2010_fd.items()[:20]:\n csv_str += \"2010,\"+str(school)+\",\"+str(count)+\"\\n\"\nfor school,count in edu2014_fd.items()[:20]:\n csv_str += \"2014,\"+str(school)+\",\"+str(count)+\"\\n\"\n\ncsv_file = open('csv_file_schools.csv','w')\ncsv_file.write(csv_str)\ncsv_file.close()",
"prompt_number": 519,
"outputs": [],
"language": "python",
"trusted": false,
"collapsed": false
},
{
"metadata": {},
"cell_type": "code",
"input": "\"\"\"Extract and format values from the profession key from the reldicts for analysis\"\"\"\n\ndef extract_prof (alist):\n final = []\n for i in alist:\n rels_pattern = r\"(?:\\[PER: (.*)\\]) (.*) (?:\\[ORG: (.*)\\])\"\n org_pattern = re.compile(r'.*\\b([Pp]rofessor|[Bb]anker|[Pp]rogrammer|[Aa]nalyst|[Aa]ssociate|[Hh]ead[master?]|[Cc]onsultant|[Cc]hairman|[Dd]octorate|[Aa]ccountant|[Ff]reelance|[Pp]artner|[Mm]anager|[Tt]eacher|[Ll]awyer|[Pp]resident|[Dd]ean|[Ee]ngineer|[Aa]ssistant|[Dd]irector|[Ee]xecutive)\\b')\n\n rels_search = re.search(rels_pattern, i)\n if rels_search != None:\n org_name = org_pattern.findall(rels_search.group(0))\n else:\n org_name = \"\"\n \n final.append(\" \".join(org_name))\n #print len(final)\n return final",
"prompt_number": 496,
"outputs": [],
"language": "python",
"trusted": false,
"collapsed": false
},
{
"metadata": {},
"cell_type": "code",
"input": "\"\"\"Run fd on the profession key from the reldicts for analysis\"\"\"\n\nprof1984_fd = nltk.FreqDist(extract_edu(prof_values1984))\nprof1990_fd = nltk.FreqDist(extract_edu(prof_values1990))\nprof2000_fd = nltk.FreqDist(extract_edu(prof_values2000))\nprof2010_fd = nltk.FreqDist(extract_edu(prof_values2010))\nprof2014_fd = nltk.FreqDist(extract_edu(prof_values2014))\n\n\nprint \"1984 - profession fd\"\nprint \"=\" * 100 , '\\n'\nfor i in prof1984_fd.items()[1:21]:\n print i[0] ,i[1] , '\\n'\n\nprint \"1990 - profession fd\"\nprint \"=\" * 100 , '\\n'\nfor i in prof1990_fd.items()[1:21]:\n print i[0] ,'\\n'\n\nprint \"2000 - profession fd\"\nprint \"=\" * 100 , '\\n'\nfor i in prof2000_fd.items()[1:21]:\n print i[0], '\\n'\n\nprint \"2010 - profession fd\"\nprint \"=\" * 100 , '\\n'\nfor i in prof2010_fd.items()[1:21]:\n print i[0] , '\\n'\n\nprint \"2014 - profession fd\"\nprint \"=\" * 100 , '\\n'\nfor i in prof2014_fd.items()[1:21]:\n print i[0] , '\\n'",
"prompt_number": 509,
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": "535\n525"
},
{
"output_type": "stream",
"stream": "stdout",
"text": "\n361"
},
{
"output_type": "stream",
"stream": "stdout",
"text": "\n106"
},
{
"output_type": "stream",
"stream": "stdout",
"text": "\n236"
},
{
"output_type": "stream",
"stream": "stdout",
"text": "\n1984 - profession fd\n==================================================================================================== \n\npresident 101 \n\nmanager 43 \n\ndirector 42 \n\nexecutive 25 \n\nassistant 18 \n\nteacher 16 \n\nassociate 15 \n\nconsultant 14 \n\npartner 14 \n\nanalyst 12 \n\nchairman 11 \n\nprofessor 11 \n\nlawyer 5 \n\naccountant 4 \n\nDean 3 \n\nbanker 3 \n\nengineer 3 \n\ndean 2 \n\ndoctorate 1 \n\n1990 - profession fd\n==================================================================================================== \n\npresident \n\ndirector \n\nassociate \n\nmanager \n\nteacher \n\nassistant \n\nexecutive \n\npartner \n\nanalyst \n\nprofessor \n\nconsultant \n\nengineer \n\nchairman \n\ndean \n\nlawyer \n\nExecutive \n\naccountant \n\nProfessor \n\nfreelance \n\nheads \n\n2000 - profession fd\n==================================================================================================== \n\ndirector \n\npresident \n\nmanager \n\nassociate \n\npartner \n\nexecutive \n\nprofessor \n\nanalyst \n\nconsultant \n\nDean \n\nassistant \n\nteacher \n\nbanker \n\nengineer \n\nchairman \n\nlawyer \n\ndean \n\n2010 - profession fd\n==================================================================================================== \n\npresident \n\nmanager \n\nassociate \n\ndirector \n\npartner \n\nanalyst \n\nprofessor \n\nteacher \n\naccountant \n\nconsultant \n\ndoctorate \n\nengineer \n\nexecutive \n\nlawyer \n\n2014 - profession fd\n==================================================================================================== \n\ndirector \n\nmanager \n\nassociate \n\nanalyst \n\npresident \n\nteacher \n\nexecutive \n\nassistant \n\npartner \n\nlawyer \n\nprofessor \n\nconsultant \n\nAssociate \n\nProfessor \n\nchairman \n\ndean \n\n"
}
],
"language": "python",
"trusted": false,
"collapsed": false
},
{
"metadata": {},
"cell_type": "code",
"input": "print prof1984_fd",
"prompt_number": 498,
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": "<FreqDist: '': 192, 'president': 101, 'manager': 43, 'director': 42, 'executive': 25, 'assistant': 18, 'teacher': 16, 'associate': 15, 'consultant': 14, 'partner': 14, ...>\n"
}
],
"language": "python",
"trusted": false,
"collapsed": false
},
{
"metadata": {},
"cell_type": "code",
"input": "A = re.compile(r'.*\\ba\\b')\nmake_rels_dict(custom_tagged_1984,rel_dict,'widow','PERSON','W',A)\nprint \"widow key has been added to master dict\"",
"prompt_number": 184,
"outputs": [
{
"ename": "ValueError",
"evalue": "your value for the object type has not been recognized: W",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m\n\u001b[0;31mValueError\u001b[0m Traceback (most recent call last)",
"\u001b[0;32m<ipython-input-184-2a43633947e9>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m()\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[0mA\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mre\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcompile\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34mr'.*\\ba\\b'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 2\u001b[0;31m \u001b[0mmake_rels_dict\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mcustom_tagged_1984\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0mrel_dict\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m'widow'\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m'PERSON'\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m'W'\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0mA\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 3\u001b[0m \u001b[0;32mprint\u001b[0m \u001b[0;34m\"widow key has been added to master dict\"\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m<ipython-input-175-c8d1bf77bc45>\u001b[0m in \u001b[0;36mmake_rels_dict\u001b[0;34m(tagged_data, rel_dict, relKey, rel1, rel2, regex)\u001b[0m\n\u001b[1;32m 6\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mdoc\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mtagged_data\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 7\u001b[0m \u001b[0mtree\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mchunker_rules\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdoc\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 8\u001b[0;31m \u001b[0mextract_rels\u001b[0m \u001b[0;34m(\u001b[0m\u001b[0mrel_dict\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdict_key\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mrelKey\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mrel1\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mrel2\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtree\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0mregex\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 9\u001b[0m \u001b[0mdict_key\u001b[0m \u001b[0;34m+=\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 10\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mrel_dict\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m<ipython-input-174-5bfe82b2bc8b>\u001b[0m in \u001b[0;36mextract_rels\u001b[0;34m(rel_dict, dict_key, relKey, rel1, rel2, tree, regex)\u001b[0m\n\u001b[1;32m 12\u001b[0m 5) tree = the parsed tree\n\u001b[1;32m 13\u001b[0m \"\"\" \n\u001b[0;32m---> 14\u001b[0;31m \u001b[0;32mfor\u001b[0m \u001b[0mrel\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mnltk\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msem\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mextract_rels\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mrel1\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mrel2\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtree\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mpattern\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mregex\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 15\u001b[0m \u001b[0mdict_values\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 16\u001b[0m \u001b[0mdict_values\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mappend\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mnltk\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msem\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mrelextract\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mshow_raw_rtuple\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mrel\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m/Users/sufia/anaconda/lib/python2.7/site-packages/nltk/sem/relextract.pyc\u001b[0m in \u001b[0;36mextract_rels\u001b[0;34m(subjclass, objclass, doc, corpus, pattern, window)\u001b[0m\n\u001b[1;32m 209\u001b[0m \u001b[0msubjclass\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0m_expand\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0msubjclass\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 210\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 211\u001b[0;31m \u001b[0;32mraise\u001b[0m \u001b[0mValueError\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m\"your value for the subject type has not been recognized: %s\"\u001b[0m \u001b[0;34m%\u001b[0m \u001b[0msubjclass\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 212\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mobjclass\u001b[0m \u001b[0;32mand\u001b[0m \u001b[0mobjclass\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mNE_CLASSES\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mcorpus\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 213\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0m_expand\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mobjclass\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mNE_CLASSES\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mcorpus\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;31mValueError\u001b[0m: your value for the object type has not been recognized: W"
],
"output_type": "pyerr"
}
],
"language": "python",
"trusted": false,
"collapsed": false
},
{
"metadata": {},
"cell_type": "code",
"input": "\"\"\"Code to assess the marriage location in -- here I will try and the relevant results as a dictionary in itself\"\"\"\n\nAT = re.compile(r'.*\\b[Aa]t\\b')\n\ndef wedding_location_finder (tagged_data,regex, mprint=False):\n marriage_location = []\n\n for doc in tagged_data:\n #Parse every document \n tree = chunker_rules(doc)\n #Relationship Extractors - #1\n \n for rel in nltk.sem.extract_rels('PERSON','ORGANIZATION', tree, pattern = regex):\n marriage_location.append(nltk.sem.relextract.show_raw_rtuple(rel))\n for rel in nltk.sem.extract_rels('LOCATION','ORGANIZATION', tree, pattern = regex):\n marriage_location.append(nltk.sem.relextract.show_raw_rtuple(rel))\n for rel in nltk.sem.extract_rels('PERSON','LOCATION', tree, pattern = regex):\n marriage_location.append(nltk.sem.relextract.show_raw_rtuple(rel))\n for rel in nltk.sem.extract_rels('LOCATION','PERSON', tree, pattern = regex):\n marriage_location.append(nltk.sem.relextract.show_raw_rtuple(rel))\n\n # \"Creating a marriage location dictionary\"\n marriage_loc_dict = {'marriage_loc':[]}\n loc = []\n other = []\n\n # \"Running code to restrict the wide net to see where the wedding took place\"\n for i in marriage_location:\n if 'performed' in i or 'arrie' in i or 'Weds' in i or 'officiate' in i or 'Temple' in i or 'Church' in i or 'church' in i:\n loc.append(i)\n marriage_loc_dict['marriage_loc'].append(i)\n else:\n other.append(i)\n \n if mprint == True:\n print \"=\" * 125 \n print \"First pass - regex patterns AT: note - casts a wide net \" , len (marriage_location)\n print \"=\" * 125 , \"\\n\"\n for i in marriage_location[:5]:\n print i + '\\n' \n\n print \"=\" * 125 \n print \"Marriage location - the ones that make it in = \" , len(loc)\n print \"=\" * 125 , \"\\n\"\n for i in loc[:5]:\n print i , '\\n'\n\n print \"=\" * 125 \n print \"Marriage location - the ones that didn't make it in = \" , len(other) \n print \"=\" * 125 , \"\\n\"\n for i in other[:5]:\n print i , '\\n'\n\n return marriage_loc_dict.values()\n \n ",
"outputs": [],
"language": "python",
"trusted": false,
"collapsed": false
},
{
"metadata": {},
"cell_type": "code",
"input": "wedding_loc_1984 = wedding_location_finder(custom_tagged_1984,AT, mprint=False)",
"prompt_number": 363,
"outputs": [],
"language": "python",
"trusted": false,
"collapsed": false
},
{
"metadata": {},
"cell_type": "markdown",
"source": "**Sample output from wedding_loc_1984 run**\n\n**=============================================================================================================================\nFirst pass - regex patterns AT: note - casts a wide net 1805\n============================================================================================================================**\n\n[PER: 'George/PERSON Eckstein/PERSON'] 'performed/O the/O ceremony/O at/O the/O' [ORG: 'Long/ORGANIZATION Ridge/ORGANIZATION Congregational/ORGANIZATION Church/ORGANIZATION']\n\n[LOC: 'White/LOCATION Plains/LOCATION'] ',/O is/O studying/O for/O a/O M.B.A./O degree/O at/O' [ORG: 'Columbia/ORGANIZATION University/ORGANIZATION']\n\n[LOC: 'N.Y./LOCATION'] ',/O and/O an/O adjunct/O associate/O professor/O at/O the/O' [ORG: 'C.V./ORGANIZATION Starr/ORGANIZATION Center/ORGANIZATION for/ORGANIZATION Applied/ORGANIZATION Economics/ORGANIZATION']\n\n[LOC: 'Stony/LOCATION Brook/LOCATION'] './O Her/O father/O is/O head/O of/O operations/O at/O' [ORG: 'L./ORGANIZATION F./ORGANIZATION Rothschild/ORGANIZATION Unterberg/ORGANIZATION Towbin/ORGANIZATION']\n\n[PER: 'Mr./PERSON Fleming/PERSON'] 'are/O senior/O vice/O presidents/O at/O' [LOC: 'Moseley/LOCATION']\n\n**=============================================================================================================================\nMarriage location - the ones that make it in = 765\n============================================================================================================================**\n\n[PER: 'George/PERSON Eckstein/PERSON'] 'performed/O the/O ceremony/O at/O the/O' [ORG: 'Long/ORGANIZATION Ridge/ORGANIZATION Congregational/ORGANIZATION Church/ORGANIZATION'] \n\n[PER: 'Robert/PERSON Cowperthwaite/PERSON'] 'performed/O the/O Episcopal/O ceremony/O at/O' [ORG: \"St./ORGANIZATION Paul/ORGANIZATION 's/ORGANIZATION Chapel/ORGANIZATION of/ORGANIZATION Trinity/ORGANIZATION Church/ORGANIZATION\"] \n\n[PER: 'Thomas/PERSON D./PERSON Bowers/PERSON'] 'performed/O the/O ceremony/O at/O' [ORG: 'St./ORGANIZATION Bartholomew/ORGANIZATION'] \n\n[PER: 'Clinton/PERSON'] './O The/O nondenominational/O ceremony/O was/O performed/O at/O the/O' [ORG: 'Hamilton/ORGANIZATION College/ORGANIZATION Chapel/ORGANIZATION'] \n\n[PER: 'W./PERSON James/PERSON White/PERSON'] 'performed/O the/O ceremony/O at/O the/O' [ORG: 'United/ORGANIZATION Methodist/ORGANIZATION Church/ORGANIZATION'] \n\n**=============================================================================================================================\nMarriage location - the ones that didn't make it in = 1040\n============================================================================================================================**\n\n[LOC: 'White/LOCATION Plains/LOCATION'] ',/O is/O studying/O for/O a/O M.B.A./O degree/O at/O' [ORG: 'Columbia/ORGANIZATION University/ORGANIZATION'] \n\n[LOC: 'N.Y./LOCATION'] ',/O and/O an/O adjunct/O associate/O professor/O at/O the/O' [ORG: 'C.V./ORGANIZATION Starr/ORGANIZATION Center/ORGANIZATION for/ORGANIZATION Applied/ORGANIZATION Economics/ORGANIZATION'] \n\n[LOC: 'Stony/LOCATION Brook/LOCATION'] './O Her/O father/O is/O head/O of/O operations/O at/O' [ORG: 'L./ORGANIZATION F./ORGANIZATION Rothschild/ORGANIZATION Unterberg/ORGANIZATION Towbin/ORGANIZATION'] \n\n[PER: 'Mr./PERSON Fleming/PERSON'] 'are/O senior/O vice/O presidents/O at/O' [LOC: 'Moseley/LOCATION'] \n\n[PER: 'Susan/PERSON Davis/PERSON Wiltshire/PERSON'] ',/O is/O a/O senior/O consultant/O at/O' [ORG: 'Research/ORGANIZATION and/ORGANIZATION Planning/ORGANIZATION Inc./ORGANIZATION'] "
},
{
"metadata": {},
"cell_type": "code",
"input": "wedding_loc_1990 = wedding_location_finder(custom_tagged_1990,AT, mprint=False)",
"prompt_number": 394,
"outputs": [],
"language": "python",
"trusted": false,
"collapsed": false
},
{
"metadata": {},
"cell_type": "code",
"input": "wedding_loc_2000 = wedding_location_finder(custom_tagged_2000,AT, mprint=False)",
"prompt_number": 395,
"outputs": [],
"language": "python",
"trusted": false,
"collapsed": false
},
{
"metadata": {},
"cell_type": "code",
"input": "wedding_loc_2010 = wedding_location_finder(custom_tagged_2010,AT, mprint=False)",
"prompt_number": 396,
"outputs": [],
"language": "python",
"trusted": false,
"collapsed": false
},
{
"metadata": {},
"cell_type": "code",
"input": "wedding_loc_2014 = wedding_location_finder(custom_tagged_2014,AT, mprint=False)",
"prompt_number": 393,
"outputs": [],
"language": "python",
"trusted": false,
"collapsed": false
},
{
"metadata": {},
"cell_type": "code",
"input": "from compiler.ast import flatten\nwedding_loc_1984_flat = flatten(wedding_loc_1984)\nwedding_loc_1990_flat = flatten(wedding_loc_1990)\nwedding_loc_2000_flat = flatten(wedding_loc_2000)\nwedding_loc_2010_flat = flatten(wedding_loc_2010)\nwedding_loc_2014_flat = flatten(wedding_loc_2014)",
"prompt_number": 397,
"outputs": [],
"language": "python",
"trusted": false,
"collapsed": false
},
{
"metadata": {},
"cell_type": "code",
"input": "def extract_wedding_location (wedding_locs):\n final = []\n for i in wedding_locs:\n rels_pattern = r\"\\[PER: (.*)\\] (.*) \\[ORG: (.*)\\]\"\n org_pattern = re.compile(r\"(\\w*\\.?)\\/ORGANIZATION\")\n rels_search = re.search(rels_pattern, i)\n \n if rels_search != None:\n org_name = org_pattern.findall(rels_search.group(3))\n else:\n org_name = \"\"\n \n final.append(\" \".join(org_name))\n return final\n",
"prompt_number": 387,
"outputs": [],
"language": "python",
"trusted": false,
"collapsed": false
},
{
"metadata": {},
"cell_type": "code",
"input": "pre1_wedding_loc_1984 = extract_wedding_location(wedding_loc_1984_flat)\npre2_wedding_loc_1984 = [nltk.word_tokenize(i) for i in pre1_wedding_loc_1984]\nwedding_loc_1984_final = flatten(pre2_wedding_loc_1984)\n\nwedding_loc_1984_fd = nltk.FreqDist(wedding_loc_1984_final)\nfor i in wedding_loc_1984_fd.items()[:10]:\n print i , '\\n'",
"prompt_number": 417,
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": "('Church', 364) \n\n('St.', 238) \n\n('Episcopal', 159) \n\n('Catholic', 99) \n\n('Roman', 97) \n\n('of', 90) \n\n('s', 68) \n\n('Temple', 48) \n\n('Christ', 39) \n\n('Club', 38) \n\n"
}
],
"language": "python",
"trusted": false,
"collapsed": false
},
{
"metadata": {},
"cell_type": "code",
"input": "pre1_wedding_loc_1990 = extract_wedding_location(wedding_loc_1990_flat)\npre2_wedding_loc_1990 = [nltk.word_tokenize(i) for i in pre1_wedding_loc_1990]\nwedding_loc_1990_final = flatten(pre2_wedding_loc_1990)\n\nwedding_loc_1990_fd = nltk.FreqDist(wedding_loc_1990_final)\nfor i in wedding_loc_1990_fd.items()[:10]:\n print i , '\\n'",
"prompt_number": 398,
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": "('Church', 454) \n\n('St.', 287) \n\n('Catholic', 151) \n\n('Roman', 151) \n\n('Episcopal', 147) \n\n('of', 109) \n\n('s', 96) \n\n('Club', 93) \n\n('Temple', 69) \n\n('John', 43) \n\n"
}
],
"language": "python",
"trusted": false,
"collapsed": false
},
{
"metadata": {},
"cell_type": "code",
"input": "pre1_wedding_loc_2000 = extract_wedding_location(wedding_loc_2000_flat)\npre2_wedding_loc_2000 = [nltk.word_tokenize(i) for i in pre1_wedding_loc_2000]\nwedding_loc_2000_final = flatten(pre2_wedding_loc_2000)\n\nwedding_loc_2000_fd = nltk.FreqDist(wedding_loc_2000_final)\nfor i in wedding_loc_2000_fd.items()[:10]:\n print i , '\\n'",
"prompt_number": 411,
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": "('Church', 297) \n\n('St.', 177) \n\n('Catholic', 96) \n\n('Roman', 95) \n\n('of', 84) \n\n('Episcopal', 78) \n\n('Club', 61) \n\n('s', 58) \n\n('John', 23) \n\n('Congregational', 21) \n\n"
}
],
"language": "python",
"trusted": false,
"collapsed": false
},
{
"metadata": {},
"cell_type": "code",
"input": "pre1_wedding_loc_2010 = extract_wedding_location(wedding_loc_2010_flat)\npre2_wedding_loc_2010 = [nltk.word_tokenize(i) for i in pre1_wedding_loc_2010]\nwedding_loc_2010_final = flatten(pre2_wedding_loc_2010)\n\nwedding_loc_2010_fd = nltk.FreqDist(wedding_loc_2010_final)\nfor i in wedding_loc_2010_fd.items()[:10]:\n print i , '\\n'",
"prompt_number": 412,
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": "('Church', 73) \n\n('St.', 40) \n\n('Club', 38) \n\n('of', 24) \n\n('Catholic', 22) \n\n('House', 15) \n\n('Country', 14) \n\n('Roman', 13) \n\n('s', 11) \n\n('Hotel', 10) \n\n"
}
],
"language": "python",
"trusted": false,
"collapsed": false
},
{
"metadata": {},
"cell_type": "code",
"input": "pre1_wedding_loc_2014 = extract_wedding_location(wedding_loc_2014_flat)\npre2_wedding_loc_2014 = [nltk.word_tokenize(i) for i in pre1_wedding_loc_2014]\nwedding_loc_2014_final = flatten(pre2_wedding_loc_2014)\n\nwedding_loc_2014_fd = nltk.FreqDist(wedding_loc_2014_final)\nfor i in wedding_loc_2014_fd.items()[:10]:\n print i , '\\n'",
"prompt_number": 413,
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": "('Church', 151) \n\n('St.', 96) \n\n('Club', 56) \n\n('Catholic', 43) \n\n('of', 39) \n\n('House', 26) \n\n('s', 26) \n\n('Roman', 22) \n\n('Episcopal', 16) \n\n('Chapel', 15) \n\n"
}
],
"language": "python",
"trusted": false,
"collapsed": false
},
{
"metadata": {},
"cell_type": "code",
"input": "# Use functions below to see how well the individual patterns do and then add to the master dictionary",
"outputs": [],
"language": "python",
"trusted": false,
"collapsed": false
},
{
"metadata": {},
"cell_type": "code",
"input": "# Tester Function to extract the relationships for individual patterns \ndef test_extract_rels (tagged_data, alist, rel1,rel2,regex):\n for doc in tagged_data:\n tree = chunker_rules(doc)\n for rel in nltk.sem.extract_rels(rel1, rel2, tree, pattern = regex):\n #print nltk.sem.relextract.show_raw_rtuple(rel)\n alist.append(nltk.sem.relextract.show_raw_rtuple(rel)) \n return alist ",
"prompt_number": 543,
"outputs": [],
"language": "python",
"trusted": false,
"collapsed": false
},
{
"metadata": {},
"cell_type": "code",
"input": "#Tester Function to append lists if needed \ndef append_rels(lists_to_append):\n master = []\n for i in lists_to_append:\n for rel in i: \n master.append(i)\n return master",
"prompt_number": 544,
"outputs": [],
"language": "python",
"trusted": false,
"collapsed": false
},
{
"metadata": {},
"cell_type": "code",
"input": "#Print function \ndef print_rels(rels):\n print \"length of list: \",len(rels)\n print \"=\" * 125 , \"\\n\"\n for i in rels[:5]:\n print i ",
"prompt_number": 545,
"outputs": [],
"language": "python",
"trusted": false,
"collapsed": false
},
{
"metadata": {},
"cell_type": "code",
"input": "#####Testing Individual Regex Patterns to add to master#########",
"prompt_number": 538,
"outputs": [],
"language": "python",
"trusted": false,
"collapsed": false
},
{
"metadata": {},
"cell_type": "code",
"input": "#Task1: Genders of who is being married\n#The bride is being extracted - related regex\nbride = []\nDAUGHTER = re.compile(r'.*\\bdaughter\\b')\nbride = test_extract_rels(custom_tagged_1984, bride, 'PERSON','PERSON',DAUGHTER)\nprint_rels(bride)",
"outputs": [],
"language": "python",
"trusted": false,
"collapsed": false
},
{
"metadata": {},
"cell_type": "code",
"input": "#OPEN QUESTION: Am I supposed to check which ones got picked and then append it to the bride list???\n#Task 1 - continued , extracting the bride \n\n#Create a list \nmarries = []\n\n#Define Regex \nMARRIES = re.compile(r'.*\\b[Mm]arrie[sd]\\b')\nWED = re.compile(r'.*\\b[Ww]eds?\\b')\nENAGEGEMENT = re.compile(r'.*\\b([Ee]ngaged|[Ee]ngagement|[Ee]gagment\\b)')\n\n#Run Relationship Extraction Function \nmarries1 = test_extract_rels(custom_tagged_1984, marries, 'PERSON','ORGANIZATION',MARRIES)\nmarries2 = test_extract_rels(custom_tagged_1984, marries, 'PERSON','PERSON',MARRIES)\nmarries3 = test_extract_rels(custom_tagged_1984, marries, 'PERSON','PERSON',WED)\nmarries4 = test_extract_rels(custom_tagged_1984, marries, 'PERSON','PERSON',ENAGEGEMENT)\n\n# Append Lists above \nlists = [marries1,marries2,marries3,marries4]\nmaster_marries = append_rels(lists)\nprint len(master_marries)\nprint '=' *100\n#Print Lists\nfor i in master_marries[:5]:\n for j in i: \n print j , \"\\n\"",
"outputs": [],
"language": "python",
"trusted": false,
"collapsed": false
},
{
"metadata": {},
"cell_type": "code",
"input": "#Task1: Genders of who is being married\n#The groom is being extracted - related regex\ngroom = []\nSON = re.compile(r'.*\\bson\\b')\ngroom = test_extract_rels(custom_tagged_1984, groom, 'PERSON','PERSON',SON)\nprint_rels(groom)",
"outputs": [],
"language": "python",
"trusted": false,
"collapsed": false
},
{
"metadata": {},
"cell_type": "code",
"input": "#Task 2 - Hometowns of whom is being married \nhometown = []\nOF = re.compile(r'.*\\bof\\b')\nhometown = test_extract_rels(custom_tagged_1984, hometown,'PERSON','LOCATION',OF)\nprint_rels(hometown)\n# strip out the false positives ",
"outputs": [],
"language": "python",
"trusted": false,
"collapsed": false
},
{
"metadata": {},
"cell_type": "code",
"input": "profession = []\nPROF = re.compile(r'.*\\b([Pp]rofessor|[Bb]anker|[Pp]rogrammer|[Aa]nalyst|[Aa]ssociate|[Hh]ead[master?]|[Cc]onsultant|[Cc]hairman|[Dd]octorate|[Aa]ccountant|[Ff]reelance|[Pp]artner|[Mm]anager|[Tt]eacher|[Ll]awyer|[Pp]resident|[Dd]ean|[Ee]ngineer|[Aa]ssistant|[Dd]irector|[Ee]xecutive)\\b')\nprofession = test_extract_rels(custom_tagged_1984, profession,'PERSON','ORGANIZATION',PROF)\nprint_rels(profession)",
"outputs": [],
"language": "python",
"trusted": false,
"collapsed": false
},
{
"metadata": {},
"cell_type": "code",
"input": "#larger funnel\nprofession_v2 = []\nIS = re.compile(r'.*\\bis\\b')\nprofession_v2 = test_extract_rels(custom_tagged_1984, profession_v2,'PERSON','ORGANIZATION',IS)\nprint_rels(profession_v2)",
"outputs": [],
"language": "python",
"trusted": false,
"collapsed": false
},
{
"metadata": {},
"cell_type": "code",
"input": "education = []\nEDU = re.compile(r'.*\\b([Dd]egree|[M.B.A.]|[M.S.]|[M.D.]|[Dd]esigner|[Mm]aster\\'s|[Gg]raduated?|[Ee]nrolled|complet[ing|ed|e])\\b')\neducation = test_extract_rels(custom_tagged_1984, education,'PERSON','ORGANIZATION',EDU)\nprint_rels(education)",
"outputs": [],
"language": "python",
"trusted": false,
"collapsed": false
},
{
"metadata": {},
"cell_type": "code",
"input": "#larger funnel\neducation_v2 = []\nFROM = re.compile(r'.*\\bfrom\\b')\neducation_v2 = test_extract_rels(custom_tagged_1984, education_v2,'PERSON','ORGANIZATION',FROM)\nprint_rels(education_v2)",
"outputs": [],
"language": "python",
"trusted": false,
"collapsed": false
},
{
"metadata": {},
"cell_type": "code",
"input": "#Marries Extractors",
"outputs": [],
"language": "python",
"trusted": false,
"collapsed": false
},
{
"metadata": {},
"cell_type": "code",
"input": "marries = []\nMARRIES = re.compile(r'.*\\b[Mm]arrie[sd]\\b')\n\nfor rel in nltk.sem.extract_rels('PERSON','ORGANIZATION', tree, pattern = MARRIES):\n marries.append(nltk.sem.relextract.show_raw_rtuple(rel))",
"outputs": [],
"language": "python",
"trusted": false,
"collapsed": false
},
{
"metadata": {},
"cell_type": "code",
"input": "MARRIES = re.compile(r'.*\\b[Mm]arrie[sd]\\b')\n\nfor rel in nltk.sem.extract_rels('PERSON','PERSON', tree, pattern = MARRIES):\n# print nltk.sem.relextract.show_raw_rtuple(rel)\n marries.append(nltk.sem.relextract.show_raw_rtuple(rel))",
"outputs": [],
"language": "python",
"trusted": false,
"collapsed": false
},
{
"metadata": {},
"cell_type": "code",
"input": "WED = re.compile(r'.*\\b[Ww]eds?\\b')\n\nfor rel in nltk.sem.extract_rels('PERSON','PERSON', tree, pattern = WED):\n# print nltk.sem.relextract.show_raw_rtuple(rel)\n marries.append(nltk.sem.relextract.show_raw_rtuple(rel))",
"outputs": [],
"language": "python",
"trusted": false,
"collapsed": false
},
{
"metadata": {},
"cell_type": "code",
"input": "ENAGEGEMNT = re.compile(r'.*\\b([Ee]ngaged|[Ee]ngagement|[Ee]gagment\\b)')\n\nfor rel in nltk.sem.extract_rels('PERSON','PERSON', tree, pattern = MARRIES):\n marries.append(nltk.sem.relextract.show_raw_rtuple(rel))",
"outputs": [],
"language": "python",
"trusted": false,
"collapsed": false
},
{
"metadata": {},
"cell_type": "code",
"input": "AT = re.compile(r'.*\\b[Aa]t\\b')\n\nfor rel in nltk.sem.extract_rels('PERSON','LOCATION', tree, pattern = AT):\n marries.append(nltk.sem.relextract.show_raw_rtuple(rel))",
"outputs": [],
"language": "python",
"trusted": false,
"collapsed": false
},
{
"metadata": {},
"cell_type": "code",
"input": "#Putting it all together:",
"outputs": [],
"language": "python",
"trusted": false,
"collapsed": false
},
{
"metadata": {},
"cell_type": "code",
"input": "#Set variable to collect info\nmarries = []\n\nfor doc in tagged_1984:\n #Parse every document \n tree = chunker_rules(doc)\n #Relationship Extractors - #1\n MARRIES = re.compile(r'.*\\b[Mm]arrie[sd]\\b')\n for rel in nltk.sem.extract_rels('PERSON','PERSON', tree, pattern = MARRIES):\n marries.append(nltk.sem.relextract.show_raw_rtuple(rel))\n #Relationship Extractors - #2\n MARRIESv2 = re.compile(r'.*\\b[Mm]arrie[sd]\\b')\n for rel in nltk.sem.extract_rels('PERSON','ORGANIZATION', tree, pattern = MARRIESv2):\n marries.append(nltk.sem.relextract.show_raw_rtuple(rel))\n #Relationship Extractors - #3\n WED = re.compile(r'.*\\b[Ww]eds?\\b')\n for rel in nltk.sem.extract_rels('PERSON','PERSON', tree, pattern = WED):\n marries.append(nltk.sem.relextract.show_raw_rtuple(rel))\n #Relationship Extractors - #4\n ENGAGEMENT = re.compile(r'.*\\b([Ee]ngaged|[Ee]ngagement|[Ee]gagment\\b)')\n for rel in nltk.sem.extract_rels('PERSON','PERSON', tree, pattern = ENGAGEMENT):\n marries.append(nltk.sem.relextract.show_raw_rtuple(rel))\n ",
"prompt_number": 577,
"outputs": [],
"language": "python",
"trusted": false,
"collapsed": false
},
{
"metadata": {},
"cell_type": "code",
"input": "print len(marries)\n\nfor i in marries[:5]:\n print i + '\\n'",
"outputs": [],
"language": "python",
"trusted": false,
"collapsed": false
},
{
"metadata": {},
"cell_type": "code",
"input": "#Alternate preprocessing for non-regex extraction\ncorpus_dict2 = {} # Structure is set up to be Year > Month > Wedding Announcements \ntracker2 = {} #to track wedding announcements that enter the dictionary. Some announcements are social events not weddings.\n\nfor fileid in wordlists.fileids():\n # Split up each wedding announcement in the file by the pattern below - '2 of 600 DOCUMENTS'\n doc_list = re.split('((?m)^\\\\s+[0-9]+\\\\s*of\\\\s*[0-9]+\\\\s+DOCUMENTS)', wordlists.raw(fileid)) \n\n num_docs = 0\n for idx in range (2,len(doc_list),2):\n num_docs += 1\n paragraphs = doc_list[idx].split('\\r\\n\\r\\n')\n sents = []\n for i in range(len(paragraphs)): \n paragraphs[i] = re.sub('\\r\\n', ' ', paragraphs[i].strip())\n sents += sent_detector.tokenize(paragraphs[i])\n\n whole_article_string = \" \".join(sents) \n\n #remove article if it has byline because it would not be a wedding announcement\n if \"BYLINE:\" in whole_article_string:\n continue \n #remove article if it is about Events, not weddings\n if \"future events\" in whole_article_string.lower():\n continue \n\n #find date of article \n date = re.search(date_pattern,whole_article_string)\n m = re.search(mp,date.group(0))\n month = m.group(0)\n y = re.search(yp,date.group(0))\n year = y.group(0)\n\n #remove junk lines and add article to dictionary\n good_lines = []\n for sent in sents:\n if len(sent) != 0: \n #remove junk lines \n unwanted_pattern = r\"\\b(^(The New York Times)$|([0-9]{1})-[0-9]{2}$|[JFMASOND]\\w+ [0-9]{1,2}, ([0-9]{4})(,?) Sunday|^(Copyright) [0-9]{4} (The New York Times Company)$|^(DATELINE:.*)|^(SECTION:.*)|^(LENGTH:.*)|^(LOAD-DATE:.*)|(http:.*)|^(PUBLICATION-TYPE:.*)|^(LANGUAGE:.*)|^(GRAPHIC:.*))\\b\" \n junk_line = re.search(unwanted_pattern, sent)\n if junk_line == None:\n \n good_lines.append(re.sub(r\"WEDDINGS/CELEBRATIONS; \", \"\", sent))\n corpus_dict2.setdefault(year,{}).setdefault(month, []).append(good_lines)\n tracker2.setdefault(fileid,[]).append(doc_list[idx-1])\n\n# print corpus_dict['1984']['March']",
"outputs": [],
"language": "python",
"trusted": false,
"collapsed": false
},
{
"metadata": {},
"cell_type": "code",
"input": "same_sex_marriages_counts = {} \nsame_sex_marriages_text = {} \nfor year in (2002,2003,2005,2010,2014):\n same_sex_marriages_counts[str(year)] = {'same_sex_announcement': 0, 'non_same_sex_announcement': 0} \n same_sex_marriages_text[str(year)] = {'same_sex_announcement': [], 'non_same_sex_announcement': []} \n for month in corpus_dict2[str(year)].keys():\n for article in corpus_dict2[str(year)][month]:\n article_str = \" \".join(article)\n if 'bride' not in article_str and 'groom' not in article_str:\n if ('daughter' not in article_str and 'son' in article_str) or ('son' not in article_str and 'daughter' in article_str):\n same_sex_marriages_counts[str(year)]['same_sex_announcement'] += 1\n same_sex_marriages_text[str(year)]['same_sex_announcement'].append(article_str)\n elif ('Mr.' not in article_str and 'Ms.' in article_str) or ('Mr.' in article_str and ('Ms.' not in article_str and 'Mrs.' not in article_str)):\n same_sex_marriages_counts[str(year)]['same_sex_announcement'] += 1\n same_sex_marriages_text[str(year)]['same_sex_announcement'].append(article_str) \n else:\n same_sex_marriages_counts[str(year)]['non_same_sex_announcement'] += 1\n same_sex_marriages_text[str(year)]['non_same_sex_announcement'].append(article_str)\n",
"outputs": [],
"language": "python",
"trusted": false,
"collapsed": false
},
{
"metadata": {},
"cell_type": "code",
"input": "widow_counts = {} \nwidow_text = {} \nfor year in (1984,1985,1986,1990,1991,1995,2000,2001,2002,2003,2005,2010,2014):\n widow_counts[str(year)] = {'yes': 0, 'no': 0} \n widow_text[str(year)] = {'yes': [], 'no': []} \n for month in corpus_dict2[str(year)].keys():\n for article in corpus_dict2[str(year)][month]:\n article_str = \" \".join(article)\n if 'widow' not in article_str:\n# if ('daughter' not in article_str and 'son' in article_str) or ('son' not in article_str and 'daughter' in article_str):\n widow_counts[str(year)]['no'] += 1\n widow_text[str(year)]['no'].append(article_str)\n# elif ('Mr.' not in article_str and 'Ms.' in article_str) or ('Mr.' in article_str and ('Ms.' not in article_str and 'Mrs.' not in article_str)):\n# same_sex_marriages_counts[str(year)]['same_sex_announcement'] += 1\n# same_sex_marriages_text[str(year)]['same_sex_announcement'].append(article_str) \n else:\n widow_counts[str(year)]['yes'] += 1\n widow_text[str(year)]['yes'].append(article_str)\n",
"outputs": [],
"language": "python",
"trusted": false,
"collapsed": false
},
{
"metadata": {},
"cell_type": "code",
"input": "divorce_counts = {} \ndivorce_text = {} \nfor year in (1984,1985,1986,1990,1991,1995,2000,2001,2002,2003,2005,2010,2014):\n divorce_counts[str(year)] = {'yes': 0, 'no': 0} \n divorce_text[str(year)] = {'yes': [], 'no': []} \n for month in corpus_dict2[str(year)].keys():\n for article in corpus_dict2[str(year)][month]:\n article_str = \" \".join(article)\n if 'divorce' not in article_str:\n# if ('daughter' not in article_str and 'son' in article_str) or ('son' not in article_str and 'daughter' in article_str):\n divorce_counts[str(year)]['no'] += 1\n divorce_text[str(year)]['no'].append(article_str)\n# elif ('Mr.' not in article_str and 'Ms.' in article_str) or ('Mr.' in article_str and ('Ms.' not in article_str and 'Mrs.' not in article_str)):\n# same_sex_marriages_counts[str(year)]['same_sex_announcement'] += 1\n# same_sex_marriages_text[str(year)]['same_sex_announcement'].append(article_str) \n else:\n divorce_counts[str(year)]['yes'] += 1\n divorce_text[str(year)]['yes'].append(article_str)\n",
"outputs": [],
"language": "python",
"trusted": false,
"collapsed": false
},
{
"metadata": {},
"cell_type": "code",
"input": "keephername_counts = {} \nkeephername_text = {} \nkhn_pattern = r'keep\\w* her name'\nprof_pattern = r'continu\\w* to use her name professionally'\nfor year in (1984,1985,1986,1990,1991,1995,2000,2001,2002,2003,2005,2010,2014):\n keephername_counts[str(year)] = {'yes': 0, 'no': 0} \n keephername_text[str(year)] = {'yes': [], 'no': []} \n for month in corpus_dict2[str(year)].keys():\n for article in corpus_dict2[str(year)][month]:\n article_str = \" \".join(article)\n match1 = re.search(khn_pattern, article_str)\n match2 = re.search(prof_pattern, article_str)\n if match1 != None or match2 != None:\n# if ('daughter' not in article_str and 'son' in article_str) or ('son' not in article_str and 'daughter' in article_str):\n keephername_counts[str(year)]['yes'] += 1\n keephername_text[str(year)]['yes'].append(article_str)\n# elif ('Mr.' not in article_str and 'Ms.' in article_str) or ('Mr.' in article_str and ('Ms.' not in article_str and 'Mrs.' not in article_str)):\n# same_sex_marriages_counts[str(year)]['same_sex_announcement'] += 1\n# same_sex_marriages_text[str(year)]['same_sex_announcement'].append(article_str) \n else:\n keephername_counts[str(year)]['no'] += 1\n keephername_text[str(year)]['no'].append(article_str)\n",
"outputs": [],
"language": "python",
"trusted": false,
"collapsed": false
}
],
"metadata": {}
}
],
"metadata": {
"name": "",
"signature": "sha256:ca007ec134faa279beea9fc48bc2c8bcf287761b77c467f1c555856f119dc22f"
},
"nbformat": 3
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment