fayeip · December 19, 2014 00:53
diff --git a/NYT_Weddings_12-18-14_Final.ipynb b/NYT_Weddings_12-18-14_Final.ipynb
 {
 "worksheets": [
  {
   "cells": [
    {
     "metadata": {},
     "cell_type": "code",
     "input": "import nltk\nfrom nltk.corpus import PlaintextCorpusReader\nimport re\nfrom itertools import chain\nfrom nltk import tokenize\nfrom nltk.corpus import stopwords\nimport nltk.data\nimport json\nimport pdb\nfrom collections import defaultdict",
     "prompt_number": 1,
     "outputs": [],
     "language": "python",
     "trusted": true,
     "collapsed": false
    },
    {
     "metadata": {},
     "cell_type": "code",
     "input": "#Importing corpus\ncorpus_root = 'data'\nwordlists = PlaintextCorpusReader(corpus_root, '.*\\\\.txt')\nsent_detector = nltk.data.load('tokenizers/punkt/english.pickle')",
     "prompt_number": 2,
     "outputs": [],
     "language": "python",
     "trusted": true,
     "collapsed": false
    },
    {
     "metadata": {},
     "cell_type": "heading",
     "source": "Part 1: Preprocessing and cleaning data",
     "level": 3
    },
    {
     "metadata": {},
     "cell_type": "code",
     "input": "#Create date regex parameters \ndate_pattern = '((J(anuary|u(ne|ly))|February|Ma(rch|y)|A(pril|ugust)|(((Sept|Nov|Dec)em)|Octo)ber).*([0-9]))'\nmp = '(J(anuary|u(ne|ly))|February|Ma(rch|y)|A(pril|ugust)|(((Sept|Nov|Dec)em)|Octo)ber)'\nyp = '[0-9]{4}'",
     "prompt_number": 3,
     "outputs": [],
     "language": "python",
     "trusted": true,
     "collapsed": false
    },
    {
     "metadata": {},
     "cell_type": "code",
     "input": "# Testing the patterns \ntest = \"This is the month of November 9, 2014\"\ndate = re.search(date_pattern,test)\nm = re.search(mp,date.group(0))\nmonth = m.group(0)\ny = re.search(yp,date.group(0))\nyear = y.group(0)",
     "prompt_number": 4,
     "outputs": [],
     "language": "python",
     "trusted": true,
     "collapsed": false
    },
    {
     "metadata": {},
     "cell_type": "code",
     "input": "#Set up the dictionaries \ncorpus_dict = {}\n\n#Putting it all together\nfor fileid in wordlists.fileids():\n    #Part 1: split of xx of DOCUMENTS \n    doc_list = re.split('((?m)^\\\\s+[0-9]+\\\\s*of\\\\s*[0-9]+\\\\s+DOCUMENTS)', wordlists.raw(fileid))\n    doc_list.pop(0)                 #got rid of garbage first empty line\n    master_list = list()            # put all documents by id, header, footer\n    #print len(doc_list)            # keep for testing -- how many documents within a single file \n    \n    #Part 2: split into id, head and footer and create a triple tuple \n    for idx in range(0, len(doc_list), 2):\n        # add a new tuple of id, header, footer\n        # split condition in order of importance\n        split_conds = ['words\\r\\n\\r\\n', 'Edition\\r\\n\\r\\n', 'Society Desk\\r\\n\\r\\n','Society Desk\\r\\n\\r\\n\\r\\n','DATELINE: Camden, Me.,\\r\\n\\r\\n\\r\\n']\n        doc_split = []\n        for cond in split_conds:\n            doc_split = re.split(cond,doc_list[idx+1], 1)\n            if len(doc_split) == 2:\n                break\n        #Part 2 contd: Error check to see if any of the splits didn't go through \n        if len(doc_split) < 2:\n            doc_parts = (doc_list[idx], doc_split)\n            print \"too few traces\"\n            pdb.set_trace()\n        elif len(doc_split) > 2:\n            print \"too many splits\"\n        else:\n            doc_parts = (doc_list[idx], doc_split[0], doc_split[1])\n#             print doc_split[0]\n#             print '<><><><><><><><><>'\n#             print doc_split[1]\n#             print \"****************************************\"\n        master_list.append(doc_parts) #Create that tuple triple \n    \n    year_counter = []\n    #Part 3: Read the header and extract date \n    for doc in master_list:\n        #Part 3 a: Header cleaning steps \n        clean_header = re.sub(r\"\\b(The New York Times|(DATELINE:.*)|(BYLINE.*)|(.*Correction Appended.*)|(SECTION:.*)|(LENGTH:.*)|(LOAD-DATE:.*)|(http:.*)|(LANGUAGE:.*)|(GRAPHIC:.*)|(Copyright.*)|(Late Edition - Final.*))\\b\", \"\", doc[1])\n        clean_header = clean_header.replace(\"\\r\",\"\").strip()\n        clean_header = [x for x in clean_header.split('\\n') if any(x.isalnum() for x in x)]\n        header_final = ' '.join(clean_header)\n\n        #Part 3b: Extracting the date\n        date = re.search(date_pattern,header_final)\n        m = re.search(mp,date.group(0))\n        month = m.group(0)\n        y = re.search(yp,date.group(0))\n        year = y.group(0)\n        year_counter.append(year)    \n\n        if \"Events\" not in header_final:\n            body = doc[2]\n            clean_sent = re.sub(r\"\\b(The New York Times|(DATELINE:.*)|(SECTION:.*)|(LENGTH:.*)|(LOAD-DATE:.*)|(http:.*)|(URL:.*)|(LANGUAGE:.*)|(PUBLICATION.*)|(GRAPHIC:.*)|(Copyright.*))\\b\", \"\", body)\n            body = re.sub('\\r\\n(?!\\r\\n)', ' ',clean_sent)\n\n            #Part 4 adding to the dictionary\n            corpus_dict.setdefault(year,{}).setdefault(month, []).append((doc[0],header_final,body))     \n            \n#Part 5: Write to a JSON file \nwith open('data/dict2014.json', 'wb') as fp:\n    json.dump(corpus_dict, fp)\n    ",
     "prompt_number": 6,
     "outputs": [],
     "language": "python",
     "trusted": true,
     "collapsed": false
    },
    {
     "metadata": {},
     "cell_type": "heading",
     "source": "Part 2: NER Tagging and Chunking",
     "level": 3
    },
    {
     "metadata": {},
     "cell_type": "code",
     "input": "#Download Stanford NER taggers\nfrom nltk.tag.stanford import POSTagger\nfrom nltk.tag.stanford import NERTagger\npost = POSTagger('lib/stanford-postagger-2014-10-26/models/english-bidirectional-distsim.tagger',\n               'lib/stanford-postagger-2014-10-26/stanford-postagger.jar', 'utf-8')\n\nnert = NERTagger('lib/stanford-ner-2014-10-26/classifiers/english.all.3class.distsim.crf.ser.gz',\n               'lib/stanford-ner-2014-10-26/stanford-ner.jar', 'utf-8')",
     "prompt_number": 303,
     "outputs": [],
     "language": "python",
     "trusted": false,
     "collapsed": false
    },
    {
     "metadata": {},
     "cell_type": "code",
     "input": "#Load the entity tagged file as a tuple of tuples \nfrom ast import literal_eval\n\ntagged_1984 = []\n\nwith open('data_tagged/1984_tagged.txt', 'r') as f:\n    for line in f:\n        line.split(',')\n        tagged_1984.append(literal_eval(line.strip()))\n",
     "prompt_number": 9,
     "outputs": [],
     "language": "python",
     "trusted": true,
     "collapsed": false
    },
    {
     "metadata": {},
     "cell_type": "code",
     "input": "# Import the RegexpParser\nfrom nltk.chunk import RegexpParser\n\n# Define custom tagged entities - group NE's together \n\ndef chunker_rules(values):\n    # Define  custom grammar (modified to be a valid regex).\n    grammar = r'''\n        PERSON:\n                {<PERSON><O><PERSON>+}\n                {<PERSON>+}\n        ORGANIZATION: \n                {<ORGANIZATION>+}\n        LOCATION: \n                {<LOCATION>+}\n        WIDOW:\n                {<W>}\n\n        DIVORCED:\n                {<D>} \n        GROOM:\n                {<G>}\n        BRIDE:\n                {<B>}\n        RELIGIOUS:\n                {<R><PERSON>+<O>+<LOCATION>}\n\n            '''\n    cp = nltk.RegexpParser(grammar) # Create an instance of your custom parser.\n    return cp.parse(values)         # Parse!\n\ndef entity_chunker(tagged_docs):\n    chunks = []\n    for doc in tagged_docs:\n        tree = chunker_rules(doc)\n        for subtree in tree.subtrees():\n            if (subtree.node == 'WIDOW'):\n                leaflist = [leaf[0] for leaf in subtree.leaves()]\n                chunks.append(' '.join(leaflist))\n    return chunks\n ",
     "prompt_number": 7,
     "outputs": [],
     "language": "python",
     "trusted": true,
     "collapsed": false
    },
    {
     "metadata": {},
     "cell_type": "code",
     "input": "# print len(entity_chunker(tagged_1984))",
     "prompt_number": 11,
     "outputs": [],
     "language": "python",
     "trusted": true,
     "collapsed": false
    },
    {
     "metadata": {},
     "cell_type": "code",
     "input": "#Customizing the tagger\n#assigning a custom tag in the word,tag \n\nfrom ast import literal_eval\n\ndef alter_source (sourcefile):\n    f_before = open(sourcefile,'r')\n    f_before_str = f_before.read()\n    f_before.close()\n\n    \"\"\"Customizing the tags labeled 'O'\n        1) Widow , Widower, Widowed     >> label: W for widow\n        2) Mr., Mrs., Adm., Sgt., Dr.   >> label: PERSON\n        3) Rev., Rabbi, priest          >> label: R for religious \n        4) bride                        >> label: B for bride \n        5) bridegroom, groom            >> label: G for groom\n\n    \"\"\"\n\n    f_after_str = ''\n    # Adding the custom tag set 1 - widow\n    f_after_str_1 = re.sub(r\"\\(\\'widow\\', \\'O\\'\\)\", \"('widow', 'W')\",f_before_str)\n    f_after_str_2 = re.sub(r\"\\(\\'widower\\', \\'O\\'\\)\", \"('widower', 'W')\",f_after_str_1)\n    f_after_str_3 = re.sub(r\"\\(\\'widowed\\', \\'O\\'\\)\", \"('widowed', 'W')\",f_after_str_2)\n\n    #Adding the custom tag set 2 - person \n    f_after_str_4 = re.sub(r\"\\(\\'Mr.\\', \\'O\\'\\)\", \"('Mr.', 'PERSON')\",f_after_str_3)\n    f_after_str_5 = re.sub(r\"\\(\\'Mrs.\\', \\'O\\'\\)\", \"('Mrs.', 'PERSON')\",f_after_str_4)\n    f_after_str_6 = re.sub(r\"\\(\\'Adm.\\', \\'O\\'\\)\", \"('Adm.', 'PERSON')\",f_after_str_5)\n    f_after_str_7 = re.sub(r\"\\(\\'Sgt.\\', \\'O\\'\\)\", \"('Sgt.', 'PERSON')\",f_after_str_6)\n    f_after_str_8 = re.sub(r\"\\(\\'Dr.\\', \\'O\\'\\)\", \"('Dr.', 'PERSON')\",f_after_str_7)\n\n\n    #Adding the custom tag set 3 - religious head \n    f_after_str_9 = re.sub(r\"\\(\\'Rev.\\', \\'O\\'\\)\", \"('Rev.', 'R')\",f_after_str_8)\n    f_after_str_10 = re.sub(r\"\\(\\'\\bRabbi\\b\\', \\'O\\'\\)\", \"('Rabbi', 'R')\",f_after_str_9)\n    f_after_str_11 = re.sub(r\"\\(\\'\\bpriest\\b\\', \\'O\\'\\)\", \"('priest','R')\",f_after_str_10)\n\n    # Adding the custom tag set 4 - divorced\n    f_after_str_12 = re.sub(r\"\\(\\'\\bdivorce\\b\\', \\'O\\'\\)\", \"('divorce', 'D')\",f_after_str_11)\n    f_after_str_13 = re.sub(r\"\\(\\'\\bdivorced\\b\\', \\'O\\'\\)\", \"('divorced', 'D')\",f_after_str_12)\n\n    # Adding the custom tag set 4 - divorced\n    f_after_str_14 = re.sub(r\"\\(\\'divorce\\', \\'O\\'\\)\", \"sufia\", f_after_str_13)\n    f_after_str_15 = re.sub(r\"\\(\\'divorced\\', \\'O\\'\\)\", \"('divorced', 'D')\",f_after_str_14)\n\n    # Adding the custom tag set 5 - bride\n    f_after_str_16 = re.sub(r\"\\(\\'\\bbride\\b\\', \\'O\\'\\)\", \"('bride', 'B')\",f_after_str_15)\n\n    # Adding the custom tag set 6 - bridegroom\n    f_after_str_17 = re.sub(r\"\\(\\'\\bbridegroom\\b\\', \\'O\\'\\)\", \"('bridegroom', 'G')\",f_after_str_16)\n    f_after_str_final = re.sub(r\"\\(\\'\\bgroom\\b\\', \\'O\\'\\)\", \"('groom', 'G')\",f_after_str_17)\n    \n    return f_after_str_final\n\n\ndef apply_custom_tags (targetfile, custom_tags):\n    f = open(targetfile,'w')\n    f.write(custom_tags)\n    f.close()\n\n    custom_tag_list = []\n\n    with open(targetfile, 'r') as g:\n        for line in g:\n            line.split('\\n')\n            custom_tag_list.append(literal_eval(line.strip()))\n    return custom_tag_list",
     "prompt_number": 32,
     "outputs": [],
     "language": "python",
     "trusted": false,
     "collapsed": false
    },
    {
     "metadata": {},
     "cell_type": "code",
     "input": "#Run custom tagger for all 5 years\n\ncustom_tagged_1984 = apply_custom_tags('data_tagged/1984_tagged_custom.txt',alter_source('data_tagged/1984_tagged.txt'))\ncustom_tagged_1990 = apply_custom_tags('data_tagged/1990_tagged_custom.txt',alter_source('data_tagged/1990_tagged.txt'))\ncustom_tagged_2000 = apply_custom_tags('data_tagged/2000_tagged_custom.txt',alter_source('data_tagged/2000_tagged.txt'))\ncustom_tagged_2010 = apply_custom_tags('data_tagged/2010_tagged_custom.txt',alter_source('data_tagged/2010_tagged.txt'))\ncustom_tagged_2014 = apply_custom_tags('data_tagged/2014_tagged_custom.txt',alter_source('data_tagged/2014_tagged.txt'))\n\n",
     "prompt_number": 186,
     "outputs": [],
     "language": "python",
     "trusted": false,
     "collapsed": false
    },
    {
     "metadata": {},
     "cell_type": "heading",
     "source": "Part 3: Creating dictionaries to store extracted data",
     "level": 3
    },
    {
     "metadata": {},
     "cell_type": "code",
     "input": "#Staging the dictionaries\nrel_dict_1984 = defaultdict(dict)\nrel_dict_1990 = defaultdict(dict)\nrel_dict_2000 = defaultdict(dict)\nrel_dict_2010 = defaultdict(dict)\nrel_dict_2014 = defaultdict(dict)",
     "prompt_number": 421,
     "outputs": [],
     "language": "python",
     "trusted": false,
     "collapsed": false
    },
    {
     "metadata": {},
     "cell_type": "code",
     "input": "def lift_names_remove_tags(relation, extracted_relation_str):\n    \n    if relation == 'DAUGHTER' or relation == 'SON':\n        rels_pattern = r\"\\[PER: (.*)\\] (.*) \\[PER: (.*)\\]\"\n        name_pattern = re.compile(r\"(\\w*\\.?)\\/PERSON\")\n        names_search = re.search(rels_pattern, extracted_relation_str)\n        bride_or_groom_name = name_pattern.findall(names_search.group(1))\n        parents_name = name_pattern.findall(names_search.group(3))\n        bride_or_groom_name_str = ''\n        parents_name_str = ''\n        for bg in bride_or_groom_name:\n            bride_or_groom_name_str += bg + \" \"\n        for p in parents_name:\n            parents_name_str += p + \" \"\n\n    return bride_or_groom_name_str.strip(), parents_name_str.strip()",
     "prompt_number": 173,
     "outputs": [],
     "language": "python",
     "trusted": false,
     "collapsed": false
    },
    {
     "metadata": {},
     "cell_type": "code",
     "input": "def extract_rels(rel_dict, dict_key, relKey, rel1, rel2, tree,regex): \n    \"\"\"This function extracts the relationships\n       Function Outputs:\n           final output = {dict_key: [{relKey:[relationship extracted]}, {relKey:[relationship extracted]}]}\n           example =      {1:[{bride:['Mary Flyn marries John Mayer],{groom: ['John is a son of Mr and Mrs Mayer]}}]}\n       \n       Function Inputs:\n           1) rel_dict = This is the default dict that will contain all the patterns in a dictionary per wedding announcement\n           2) dict_key = This is basically a counter per wedding announcment \n           3) relKey   = This is the second key i.e. the relationship type you want to get values for \n           4) rel1 , rel2, regex = 'PERSON' [the word \"marries\"] 'PERSON'\n           5) tree = the parsed tree\n    \"\"\"  \n    for rel in nltk.sem.extract_rels(rel1, rel2, tree, pattern = regex):\n        dict_values = []\n        dict_values.append(nltk.sem.relextract.show_raw_rtuple(rel))\n        rel_dict[str(dict_key)][relKey] = dict_values",
     "prompt_number": 174,
     "outputs": [],
     "language": "python",
     "trusted": false,
     "collapsed": false
    },
    {
     "metadata": {},
     "cell_type": "code",
     "input": "def make_rels_dict(tagged_data, rel_dict, relKey, rel1,rel2,regex):\n    \"\"\"This function makes the dictionary for the relationships you want to extract -- \n        read comments in function \"extract_rels\" for more context\"\"\"\n    dict_key = 1\n        \n    for doc in tagged_data:\n        tree = chunker_rules(doc)\n        extract_rels (rel_dict, dict_key, relKey, rel1, rel2, tree,regex)\n        dict_key +=1\n    return rel_dict",
     "prompt_number": 175,
     "outputs": [],
     "language": "python",
     "trusted": false,
     "collapsed": false
    },
    {
     "metadata": {},
     "cell_type": "code",
     "input": "DAUGHTER = re.compile(r'.*\\bdaughter\\b')\n\"\"\"Create an entry in a dictionary for the bride based on the pattern called DAUGHTER\"\"\"\n\nmake_rels_dict(custom_tagged_1984,rel_dict_1984,'bride','PERSON','PERSON',DAUGHTER)\nmake_rels_dict(custom_tagged_1990,rel_dict_1990,'bride','PERSON','PERSON',DAUGHTER)\nmake_rels_dict(custom_tagged_2000,rel_dict_2000,'bride','PERSON','PERSON',DAUGHTER)\nmake_rels_dict(custom_tagged_2010,rel_dict_2010,'bride','PERSON','PERSON',DAUGHTER)\nmake_rels_dict(custom_tagged_2014,rel_dict_2014,'bride','PERSON','PERSON',DAUGHTER)\nprint \"Bride key has been added to master dict\"",
     "prompt_number": 422,
     "outputs": [
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": "Bride key has been added to master dict\n"
      }
     ],
     "language": "python",
     "trusted": false,
     "collapsed": false
    },
    {
     "metadata": {},
     "cell_type": "code",
     "input": "test = rel_dict['347']['bride']\nprint test",
     "prompt_number": 128,
     "outputs": [
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": "[\"[PER: 'Leicia/PERSON Sharon/PERSON Osborne/PERSON'] ',/O the/O daughter/O of/O' [PER: 'Mr./PERSON and/O Mrs./PERSON Philip/PERSON Barry/PERSON Osborne/PERSON']\"]\n"
      }
     ],
     "language": "python",
     "trusted": false,
     "collapsed": false
    },
    {
     "metadata": {},
     "cell_type": "code",
     "input": "lift_names_remove_tags('DAUGHTER', test[0])",
     "prompt_number": 129,
     "outputs": [
      {
       "output_type": "pyout",
       "prompt_number": 129,
       "metadata": {},
       "text": "('Leicia Sharon Osborne', 'Mr. Mrs. Philip Barry Osborne')"
      }
     ],
     "language": "python",
     "trusted": false,
     "collapsed": false
    },
    {
     "metadata": {},
     "cell_type": "code",
     "input": "SON = re.compile(r'.*\\bson\\b')\n\"\"\"Create an entry in a dictionary for the groom based on the pattern called SON\"\"\"\n\nmake_rels_dict(custom_tagged_1984,rel_dict_1984,'groom','PERSON','PERSON',SON) \nmake_rels_dict(custom_tagged_1990,rel_dict_1990,'groom','PERSON','PERSON',SON) \nmake_rels_dict(custom_tagged_2000,rel_dict_2000,'groom','PERSON','PERSON',SON) \nmake_rels_dict(custom_tagged_2010,rel_dict_2010,'groom','PERSON','PERSON',SON) \nmake_rels_dict(custom_tagged_2014,rel_dict_2014,'groom','PERSON','PERSON',SON) \nprint \"Groom key has been added to master dict\"",
     "prompt_number": 423,
     "outputs": [
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": "Groom key has been added to master dict\n"
      }
     ],
     "language": "python",
     "trusted": false,
     "collapsed": false
    },
    {
     "metadata": {},
     "cell_type": "code",
     "input": "groom_test = rel_dict['347']['groom']\nprint groom_test",
     "prompt_number": 132,
     "outputs": [
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": "[\"[PER: 'Michael/PERSON Anthony/PERSON Milano/PERSON'] ',/O a/O son/O of/O' [PER: 'Mr./PERSON and/O Mrs./PERSON John/PERSON A./PERSON Milano/PERSON']\"]\n"
      }
     ],
     "language": "python",
     "trusted": false,
     "collapsed": false
    },
    {
     "metadata": {},
     "cell_type": "code",
     "input": "lift_names_remove_tags('SON', groom_test[0])",
     "prompt_number": 133,
     "outputs": [
      {
       "output_type": "pyout",
       "prompt_number": 133,
       "metadata": {},
       "text": "('Michael Anthony Milano', 'Mr. Mrs. John A. Milano')"
      }
     ],
     "language": "python",
     "trusted": false,
     "collapsed": false
    },
    {
     "metadata": {},
     "cell_type": "code",
     "input": "#Task 2 - Hometowns of whom is being married \nOF = re.compile(r'.*\\bof\\b')\n\"\"\"Create an entry in a dictionary for the marriage location based on the pattern called DAUGHTER\"\"\"\n\nmake_rels_dict(custom_tagged_1984,rel_dict_1984,'hometowns','PERSON','LOCATION',OF) \nmake_rels_dict(custom_tagged_1990,rel_dict_1990,'hometowns','PERSON','LOCATION',OF) \nmake_rels_dict(custom_tagged_2000,rel_dict_2000,'hometowns','PERSON','LOCATION',OF) \nmake_rels_dict(custom_tagged_2010,rel_dict_2010,'hometowns','PERSON','LOCATION',OF) \nmake_rels_dict(custom_tagged_2014,rel_dict_2014,'hometowns','PERSON','LOCATION',OF) \nprint \"Hometowns key has been added to master dict\"",
     "prompt_number": 424,
     "outputs": [
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": "Hometowns key has been added to master dict\n"
      }
     ],
     "language": "python",
     "trusted": false,
     "collapsed": false
    },
    {
     "metadata": {},
     "cell_type": "code",
     "input": "IS = re.compile(r'.*\\bis\\b')\n\"\"\"Create an entry in a dictionary for profession_v2 based on the pattern called IS\"\"\"\n\nmake_rels_dict(custom_tagged_1984,rel_dict_1984,'profession_v2','PERSON','ORGANIZATION',IS)\nmake_rels_dict(custom_tagged_1990,rel_dict_1990,'profession_v2','PERSON','ORGANIZATION',IS)\nmake_rels_dict(custom_tagged_2000,rel_dict_2000,'profession_v2','PERSON','ORGANIZATION',IS)\nmake_rels_dict(custom_tagged_2010,rel_dict_2010,'profession_v2','PERSON','ORGANIZATION',IS)\nmake_rels_dict(custom_tagged_2014,rel_dict_2014,'profession_v2','PERSON','ORGANIZATION',IS)\nprint \"Professions version 2 (example: Mary May is a teacher at Riverdale School) key has been added to master dict\"",
     "prompt_number": 425,
     "outputs": [
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": "Professions version 2 (example: Mary May is a teacher at Riverdale School) key has been added to master dict\n"
      }
     ],
     "language": "python",
     "trusted": false,
     "collapsed": false
    },
    {
     "metadata": {},
     "cell_type": "code",
     "input": "EDU = re.compile(r'.*\\b([Dd]egree|[M.B.A.]|[M.S.]|[M.D.]|[Dd]esigner|[Mm]aster\\'s|[Gg]raduated?|[Ee]nrolled?|complet[ing|ed|e])\\b')\n\"\"\"Create an entry in a dictionary for education based on the pattern called EDU\"\"\"\n\nmake_rels_dict(custom_tagged_1984,rel_dict_1984,'education','PERSON','ORGANIZATION',EDU)\nmake_rels_dict(custom_tagged_1990,rel_dict_1990,'education','PERSON','ORGANIZATION',EDU)\nmake_rels_dict(custom_tagged_2000,rel_dict_2000,'education','PERSON','ORGANIZATION',EDU)\nmake_rels_dict(custom_tagged_2010,rel_dict_2010,'education','PERSON','ORGANIZATION',EDU)\nmake_rels_dict(custom_tagged_2014,rel_dict_2014,'education','PERSON','ORGANIZATION',EDU)\nprint \"Education key has been added to master dict\"",
     "prompt_number": 426,
     "outputs": [
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": "Education key has been added to master dict\n"
      }
     ],
     "language": "python",
     "trusted": false,
     "collapsed": false
    },
    {
     "metadata": {},
     "cell_type": "code",
     "input": "\"\"\"Extract values from the education and profession keys from the reldicts for analysis\"\"\"\n\ndef extract_relsdict_values (rel_dict):\n    range_reldict = len(rel_dict.keys())\n    edu_values = []\n    prof_values = []\n    hometown_values = []\n    \n    for idx in range (0,range_reldict):\n        if rel_dict[str(idx)].has_key('education'):\n            edu_values.append(rel_dict[str(idx)]['education'])\n        elif rel_dict[str(idx)].has_key('profession_v2'):\n            prof_values.append(rel_dict[str(idx)]['profession_v2'])\n\n        \n        edu_values_flat = flatten(edu_values)\n        prof_values_flat = flatten(prof_values)\n    return edu_values_flat, prof_values_flat",
     "prompt_number": 453,
     "outputs": [],
     "language": "python",
     "trusted": false,
     "collapsed": false
    },
    {
     "metadata": {},
     "cell_type": "code",
     "input": "\"\"\"Extract and format values from the education key from the reldicts for analysis\"\"\"\n\ndef extract_edu (alist):\n    final = []\n    for i in alist:\n        rels_pattern = r\"\\[PER: (.*)\\] (.*) \\[ORG: (.*)\\]\"\n        org_pattern = re.compile(r\"(\\w*\\.?)\\/ORGANIZATION\")\n        rels_search = re.search(rels_pattern, i)\n        if rels_search != None:\n            org_name = org_pattern.findall(rels_search.group(3))\n        else:\n            org_name = \"\"\n        \n        final.append(\" \".join(org_name))\n    return final",
     "prompt_number": 510,
     "outputs": [],
     "language": "python",
     "trusted": false,
     "collapsed": false
    },
    {
     "metadata": {},
     "cell_type": "code",
     "input": "\"\"\"Extract values from the education and profession keys from the reldicts for analysis\"\"\"\n\nedu_values1984, prof_values1984  = extract_relsdict_values(rel_dict_1984)\nedu_values1990, prof_values1990  = extract_relsdict_values(rel_dict_1990)\nedu_values2000, prof_values2000  = extract_relsdict_values(rel_dict_2000)\nedu_values2010, prof_values2010  = extract_relsdict_values(rel_dict_2010)\nedu_values2014, prof_values2014  = extract_relsdict_values(rel_dict_2014)",
     "prompt_number": 511,
     "outputs": [],
     "language": "python",
     "trusted": false,
     "collapsed": false
    },
    {
     "metadata": {},
     "cell_type": "code",
     "input": "\"\"\"Run fd on the education key from the reldicts for analysis\"\"\"\n\nedu1984_fd = nltk.FreqDist(extract_edu(edu_values1984))\nedu1990_fd = nltk.FreqDist(extract_edu(edu_values1990))\nedu2000_fd = nltk.FreqDist(extract_edu(edu_values2000))\nedu2010_fd = nltk.FreqDist(extract_edu(edu_values2010))\nedu2014_fd = nltk.FreqDist(extract_edu(edu_values2014))\n\ncsv_str = \"\"\nfor school,count in edu1984_fd.items()[:20]:\n    csv_str += \"1984,\"+str(school)+\",\"+str(count)+\"\\n\"\nfor school,count in edu1990_fd.items()[:20]:\n    csv_str += \"1990,\"+str(school)+\",\"+str(count)+\"\\n\"\nfor school,count in edu2000_fd.items()[:20]:\n    csv_str += \"2000,\"+str(school)+\",\"+str(count)+\"\\n\"\nfor school,count in edu2010_fd.items()[:20]:\n    csv_str += \"2010,\"+str(school)+\",\"+str(count)+\"\\n\"\nfor school,count in edu2014_fd.items()[:20]:\n    csv_str += \"2014,\"+str(school)+\",\"+str(count)+\"\\n\"\n\ncsv_file = open('csv_file_schools.csv','w')\ncsv_file.write(csv_str)\ncsv_file.close()",
     "prompt_number": 519,
     "outputs": [],
     "language": "python",
     "trusted": false,
     "collapsed": false
    },
    {
     "metadata": {},
     "cell_type": "code",
     "input": "\"\"\"Extract and format values from the profession key from the reldicts for analysis\"\"\"\n\ndef extract_prof (alist):\n    final = []\n    for i in alist:\n        rels_pattern = r\"(?:\\[PER: (.*)\\]) (.*) (?:\\[ORG: (.*)\\])\"\n        org_pattern = re.compile(r'.*\\b([Pp]rofessor|[Bb]anker|[Pp]rogrammer|[Aa]nalyst|[Aa]ssociate|[Hh]ead[master?]|[Cc]onsultant|[Cc]hairman|[Dd]octorate|[Aa]ccountant|[Ff]reelance|[Pp]artner|[Mm]anager|[Tt]eacher|[Ll]awyer|[Pp]resident|[Dd]ean|[Ee]ngineer|[Aa]ssistant|[Dd]irector|[Ee]xecutive)\\b')\n\n        rels_search = re.search(rels_pattern, i)\n        if rels_search != None:\n            org_name = org_pattern.findall(rels_search.group(0))\n        else:\n            org_name = \"\"\n        \n        final.append(\" \".join(org_name))\n    #print len(final)\n    return final",
     "prompt_number": 496,
     "outputs": [],
     "language": "python",
     "trusted": false,
     "collapsed": false
    },
    {
     "metadata": {},
     "cell_type": "code",
     "input": "\"\"\"Run fd on the profession key from the reldicts for analysis\"\"\"\n\nprof1984_fd = nltk.FreqDist(extract_edu(prof_values1984))\nprof1990_fd = nltk.FreqDist(extract_edu(prof_values1990))\nprof2000_fd = nltk.FreqDist(extract_edu(prof_values2000))\nprof2010_fd = nltk.FreqDist(extract_edu(prof_values2010))\nprof2014_fd = nltk.FreqDist(extract_edu(prof_values2014))\n\n\nprint \"1984 - profession fd\"\nprint \"=\" * 100 , '\\n'\nfor i in prof1984_fd.items()[1:21]:\n    print i[0] ,i[1] , '\\n'\n\nprint \"1990 - profession fd\"\nprint \"=\" * 100 , '\\n'\nfor i in  prof1990_fd.items()[1:21]:\n    print i[0] ,'\\n'\n\nprint \"2000 - profession fd\"\nprint \"=\" * 100 , '\\n'\nfor i in prof2000_fd.items()[1:21]:\n    print i[0], '\\n'\n\nprint \"2010 - profession fd\"\nprint \"=\" * 100 , '\\n'\nfor i in prof2010_fd.items()[1:21]:\n    print i[0] , '\\n'\n\nprint \"2014 - profession fd\"\nprint \"=\" * 100 , '\\n'\nfor i in prof2014_fd.items()[1:21]:\n    print i[0] , '\\n'",
     "prompt_number": 509,
     "outputs": [
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": "535\n525"
      },
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": "\n361"
      },
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": "\n106"
      },
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": "\n236"
      },
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": "\n1984 - profession fd\n==================================================================================================== \n\npresident 101 \n\nmanager 43 \n\ndirector 42 \n\nexecutive 25 \n\nassistant 18 \n\nteacher 16 \n\nassociate 15 \n\nconsultant 14 \n\npartner 14 \n\nanalyst 12 \n\nchairman 11 \n\nprofessor 11 \n\nlawyer 5 \n\naccountant 4 \n\nDean 3 \n\nbanker 3 \n\nengineer 3 \n\ndean 2 \n\ndoctorate 1 \n\n1990 - profession fd\n==================================================================================================== \n\npresident \n\ndirector \n\nassociate \n\nmanager \n\nteacher \n\nassistant \n\nexecutive \n\npartner \n\nanalyst \n\nprofessor \n\nconsultant \n\nengineer \n\nchairman \n\ndean \n\nlawyer \n\nExecutive \n\naccountant \n\nProfessor \n\nfreelance \n\nheads \n\n2000 - profession fd\n==================================================================================================== \n\ndirector \n\npresident \n\nmanager \n\nassociate \n\npartner \n\nexecutive \n\nprofessor \n\nanalyst \n\nconsultant \n\nDean \n\nassistant \n\nteacher \n\nbanker \n\nengineer \n\nchairman \n\nlawyer \n\ndean \n\n2010 - profession fd\n==================================================================================================== \n\npresident \n\nmanager \n\nassociate \n\ndirector \n\npartner \n\nanalyst \n\nprofessor \n\nteacher \n\naccountant \n\nconsultant \n\ndoctorate \n\nengineer \n\nexecutive \n\nlawyer \n\n2014 - profession fd\n==================================================================================================== \n\ndirector \n\nmanager \n\nassociate \n\nanalyst \n\npresident \n\nteacher \n\nexecutive \n\nassistant \n\npartner \n\nlawyer \n\nprofessor \n\nconsultant \n\nAssociate \n\nProfessor \n\nchairman \n\ndean \n\n"
      }
     ],
     "language": "python",
     "trusted": false,
     "collapsed": false
    },
    {
     "metadata": {},
     "cell_type": "code",
     "input": "print prof1984_fd",
     "prompt_number": 498,
     "outputs": [
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": "<FreqDist: '': 192, 'president': 101, 'manager': 43, 'director': 42, 'executive': 25, 'assistant': 18, 'teacher': 16, 'associate': 15, 'consultant': 14, 'partner': 14, ...>\n"
      }
     ],
     "language": "python",
     "trusted": false,
     "collapsed": false
    },
    {
     "metadata": {},
     "cell_type": "code",
     "input": "A = re.compile(r'.*\\ba\\b')\nmake_rels_dict(custom_tagged_1984,rel_dict,'widow','PERSON','W',A)\nprint \"widow key has been added to master dict\"",
     "prompt_number": 184,
     "outputs": [
      {
       "ename": "ValueError",
       "evalue": "your value for the object type has not been recognized: W",
       "traceback": [
        "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m\n\u001b[0;31mValueError\u001b[0m                                Traceback (most recent call last)",
        "\u001b[0;32m<ipython-input-184-2a43633947e9>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m()\u001b[0m\n\u001b[1;32m      1\u001b[0m \u001b[0mA\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mre\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcompile\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34mr'.*\\ba\\b'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 2\u001b[0;31m \u001b[0mmake_rels_dict\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mcustom_tagged_1984\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0mrel_dict\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m'widow'\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m'PERSON'\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m'W'\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0mA\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m      3\u001b[0m \u001b[0;32mprint\u001b[0m \u001b[0;34m\"widow key has been added to master dict\"\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
        "\u001b[0;32m<ipython-input-175-c8d1bf77bc45>\u001b[0m in \u001b[0;36mmake_rels_dict\u001b[0;34m(tagged_data, rel_dict, relKey, rel1, rel2, regex)\u001b[0m\n\u001b[1;32m      6\u001b[0m     \u001b[0;32mfor\u001b[0m \u001b[0mdoc\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mtagged_data\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m      7\u001b[0m         \u001b[0mtree\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mchunker_rules\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdoc\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 8\u001b[0;31m         \u001b[0mextract_rels\u001b[0m \u001b[0;34m(\u001b[0m\u001b[0mrel_dict\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdict_key\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mrelKey\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mrel1\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mrel2\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtree\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0mregex\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m      9\u001b[0m         \u001b[0mdict_key\u001b[0m \u001b[0;34m+=\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m     10\u001b[0m     \u001b[0;32mreturn\u001b[0m \u001b[0mrel_dict\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
        "\u001b[0;32m<ipython-input-174-5bfe82b2bc8b>\u001b[0m in \u001b[0;36mextract_rels\u001b[0;34m(rel_dict, dict_key, relKey, rel1, rel2, tree, regex)\u001b[0m\n\u001b[1;32m     12\u001b[0m            5) tree = the parsed tree\n\u001b[1;32m     13\u001b[0m     \"\"\"  \n\u001b[0;32m---> 14\u001b[0;31m     \u001b[0;32mfor\u001b[0m \u001b[0mrel\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mnltk\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msem\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mextract_rels\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mrel1\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mrel2\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtree\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mpattern\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mregex\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m     15\u001b[0m         \u001b[0mdict_values\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m     16\u001b[0m         \u001b[0mdict_values\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mappend\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mnltk\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msem\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mrelextract\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mshow_raw_rtuple\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mrel\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
        "\u001b[0;32m/Users/sufia/anaconda/lib/python2.7/site-packages/nltk/sem/relextract.pyc\u001b[0m in \u001b[0;36mextract_rels\u001b[0;34m(subjclass, objclass, doc, corpus, pattern, window)\u001b[0m\n\u001b[1;32m    209\u001b[0m             \u001b[0msubjclass\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0m_expand\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0msubjclass\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    210\u001b[0m         \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 211\u001b[0;31m             \u001b[0;32mraise\u001b[0m \u001b[0mValueError\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m\"your value for the subject type has not been recognized: %s\"\u001b[0m \u001b[0;34m%\u001b[0m \u001b[0msubjclass\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    212\u001b[0m     \u001b[0;32mif\u001b[0m \u001b[0mobjclass\u001b[0m \u001b[0;32mand\u001b[0m \u001b[0mobjclass\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mNE_CLASSES\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mcorpus\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    213\u001b[0m         \u001b[0;32mif\u001b[0m \u001b[0m_expand\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mobjclass\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mNE_CLASSES\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mcorpus\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
        "\u001b[0;31mValueError\u001b[0m: your value for the object type has not been recognized: W"
       ],
       "output_type": "pyerr"
      }
     ],
     "language": "python",
     "trusted": false,
     "collapsed": false
    },
    {
     "metadata": {},
     "cell_type": "code",
     "input": "\"\"\"Code to assess the marriage location in -- here I will try and the relevant results as a dictionary in itself\"\"\"\n\nAT = re.compile(r'.*\\b[Aa]t\\b')\n\ndef wedding_location_finder (tagged_data,regex, mprint=False):\n    marriage_location = []\n\n    for doc in tagged_data:\n        #Parse every document \n        tree = chunker_rules(doc)\n        #Relationship Extractors - #1\n        \n        for rel in nltk.sem.extract_rels('PERSON','ORGANIZATION', tree, pattern = regex):\n            marriage_location.append(nltk.sem.relextract.show_raw_rtuple(rel))\n        for rel in nltk.sem.extract_rels('LOCATION','ORGANIZATION', tree, pattern = regex):\n            marriage_location.append(nltk.sem.relextract.show_raw_rtuple(rel))\n        for rel in nltk.sem.extract_rels('PERSON','LOCATION', tree, pattern = regex):\n            marriage_location.append(nltk.sem.relextract.show_raw_rtuple(rel))\n        for rel in nltk.sem.extract_rels('LOCATION','PERSON', tree, pattern = regex):\n            marriage_location.append(nltk.sem.relextract.show_raw_rtuple(rel))\n\n    # \"Creating a marriage location dictionary\"\n    marriage_loc_dict = {'marriage_loc':[]}\n    loc = []\n    other = []\n\n    # \"Running code to restrict the wide net to see where the wedding took place\"\n    for i in marriage_location:\n        if 'performed' in i or 'arrie' in i or 'Weds' in i or 'officiate' in i or 'Temple' in i or 'Church' in i or 'church' in i:\n            loc.append(i)\n            marriage_loc_dict['marriage_loc'].append(i)\n        else:\n            other.append(i)\n    \n    if mprint == True:\n        print \"=\" * 125 \n        print \"First pass - regex patterns AT: note - casts a wide net \" , len (marriage_location)\n        print \"=\" * 125 , \"\\n\"\n        for i in marriage_location[:5]:\n            print i + '\\n'  \n\n        print \"=\" * 125 \n        print \"Marriage location - the ones that make it in = \" , len(loc)\n        print \"=\" * 125 , \"\\n\"\n        for i in loc[:5]:\n            print i , '\\n'\n\n        print \"=\" * 125 \n        print \"Marriage location - the ones that didn't make it in = \" , len(other)        \n        print \"=\" * 125 , \"\\n\"\n        for i in other[:5]:\n            print i , '\\n'\n\n    return marriage_loc_dict.values()\n    \n    ",
     "outputs": [],
     "language": "python",
     "trusted": false,
     "collapsed": false
    },
    {
     "metadata": {},
     "cell_type": "code",
     "input": "wedding_loc_1984 = wedding_location_finder(custom_tagged_1984,AT, mprint=False)",
     "prompt_number": 363,
     "outputs": [],
     "language": "python",
     "trusted": false,
     "collapsed": false
    },
    {
     "metadata": {},
     "cell_type": "markdown",
     "source": "**Sample output from wedding_loc_1984 run**\n\n**=============================================================================================================================\nFirst pass - regex patterns AT: note - casts a wide net  1805\n============================================================================================================================**\n\n[PER: 'George/PERSON Eckstein/PERSON'] 'performed/O the/O ceremony/O at/O the/O' [ORG: 'Long/ORGANIZATION Ridge/ORGANIZATION Congregational/ORGANIZATION Church/ORGANIZATION']\n\n[LOC: 'White/LOCATION Plains/LOCATION'] ',/O is/O studying/O for/O a/O M.B.A./O degree/O at/O' [ORG: 'Columbia/ORGANIZATION University/ORGANIZATION']\n\n[LOC: 'N.Y./LOCATION'] ',/O and/O an/O adjunct/O associate/O professor/O at/O the/O' [ORG: 'C.V./ORGANIZATION Starr/ORGANIZATION Center/ORGANIZATION for/ORGANIZATION Applied/ORGANIZATION Economics/ORGANIZATION']\n\n[LOC: 'Stony/LOCATION Brook/LOCATION'] './O Her/O father/O is/O head/O of/O operations/O at/O' [ORG: 'L./ORGANIZATION F./ORGANIZATION Rothschild/ORGANIZATION Unterberg/ORGANIZATION Towbin/ORGANIZATION']\n\n[PER: 'Mr./PERSON Fleming/PERSON'] 'are/O senior/O vice/O presidents/O at/O' [LOC: 'Moseley/LOCATION']\n\n**=============================================================================================================================\nMarriage location - the ones that make it in =  765\n============================================================================================================================**\n\n[PER: 'George/PERSON Eckstein/PERSON'] 'performed/O the/O ceremony/O at/O the/O' [ORG: 'Long/ORGANIZATION Ridge/ORGANIZATION Congregational/ORGANIZATION Church/ORGANIZATION'] \n\n[PER: 'Robert/PERSON Cowperthwaite/PERSON'] 'performed/O the/O Episcopal/O ceremony/O at/O' [ORG: \"St./ORGANIZATION Paul/ORGANIZATION 's/ORGANIZATION Chapel/ORGANIZATION of/ORGANIZATION Trinity/ORGANIZATION Church/ORGANIZATION\"] \n\n[PER: 'Thomas/PERSON D./PERSON Bowers/PERSON'] 'performed/O the/O ceremony/O at/O' [ORG: 'St./ORGANIZATION Bartholomew/ORGANIZATION'] \n\n[PER: 'Clinton/PERSON'] './O The/O nondenominational/O ceremony/O was/O performed/O at/O the/O' [ORG: 'Hamilton/ORGANIZATION College/ORGANIZATION Chapel/ORGANIZATION'] \n\n[PER: 'W./PERSON James/PERSON White/PERSON'] 'performed/O the/O ceremony/O at/O the/O' [ORG: 'United/ORGANIZATION Methodist/ORGANIZATION Church/ORGANIZATION'] \n\n**=============================================================================================================================\nMarriage location - the ones that didn't make it in =  1040\n============================================================================================================================**\n\n[LOC: 'White/LOCATION Plains/LOCATION'] ',/O is/O studying/O for/O a/O M.B.A./O degree/O at/O' [ORG: 'Columbia/ORGANIZATION University/ORGANIZATION'] \n\n[LOC: 'N.Y./LOCATION'] ',/O and/O an/O adjunct/O associate/O professor/O at/O the/O' [ORG: 'C.V./ORGANIZATION Starr/ORGANIZATION Center/ORGANIZATION for/ORGANIZATION Applied/ORGANIZATION Economics/ORGANIZATION'] \n\n[LOC: 'Stony/LOCATION Brook/LOCATION'] './O Her/O father/O is/O head/O of/O operations/O at/O' [ORG: 'L./ORGANIZATION F./ORGANIZATION Rothschild/ORGANIZATION Unterberg/ORGANIZATION Towbin/ORGANIZATION'] \n\n[PER: 'Mr./PERSON Fleming/PERSON'] 'are/O senior/O vice/O presidents/O at/O' [LOC: 'Moseley/LOCATION'] \n\n[PER: 'Susan/PERSON Davis/PERSON Wiltshire/PERSON'] ',/O is/O a/O senior/O consultant/O at/O' [ORG: 'Research/ORGANIZATION and/ORGANIZATION Planning/ORGANIZATION Inc./ORGANIZATION'] "
    },
    {
     "metadata": {},
     "cell_type": "code",
     "input": "wedding_loc_1990 = wedding_location_finder(custom_tagged_1990,AT, mprint=False)",
     "prompt_number": 394,
     "outputs": [],
     "language": "python",
     "trusted": false,
     "collapsed": false
    },
    {
     "metadata": {},
     "cell_type": "code",
     "input": "wedding_loc_2000 = wedding_location_finder(custom_tagged_2000,AT, mprint=False)",
     "prompt_number": 395,
     "outputs": [],
     "language": "python",
     "trusted": false,
     "collapsed": false
    },
    {
     "metadata": {},
     "cell_type": "code",
     "input": "wedding_loc_2010 = wedding_location_finder(custom_tagged_2010,AT, mprint=False)",
     "prompt_number": 396,
     "outputs": [],
     "language": "python",
     "trusted": false,
     "collapsed": false
    },
    {
     "metadata": {},
     "cell_type": "code",
     "input": "wedding_loc_2014 = wedding_location_finder(custom_tagged_2014,AT, mprint=False)",
     "prompt_number": 393,
     "outputs": [],
     "language": "python",
     "trusted": false,
     "collapsed": false
    },
    {
     "metadata": {},
     "cell_type": "code",
     "input": "from compiler.ast import flatten\nwedding_loc_1984_flat = flatten(wedding_loc_1984)\nwedding_loc_1990_flat = flatten(wedding_loc_1990)\nwedding_loc_2000_flat = flatten(wedding_loc_2000)\nwedding_loc_2010_flat = flatten(wedding_loc_2010)\nwedding_loc_2014_flat = flatten(wedding_loc_2014)",
     "prompt_number": 397,
     "outputs": [],
     "language": "python",
     "trusted": false,
     "collapsed": false
    },
    {
     "metadata": {},
     "cell_type": "code",
     "input": "def extract_wedding_location (wedding_locs):\n    final = []\n    for i in wedding_locs:\n        rels_pattern = r\"\\[PER: (.*)\\] (.*) \\[ORG: (.*)\\]\"\n        org_pattern = re.compile(r\"(\\w*\\.?)\\/ORGANIZATION\")\n        rels_search = re.search(rels_pattern, i)\n        \n        if rels_search != None:\n            org_name = org_pattern.findall(rels_search.group(3))\n        else:\n            org_name = \"\"\n        \n        final.append(\" \".join(org_name))\n    return final\n",
     "prompt_number": 387,
     "outputs": [],
     "language": "python",
     "trusted": false,
     "collapsed": false
    },
    {
     "metadata": {},
     "cell_type": "code",
     "input": "pre1_wedding_loc_1984 = extract_wedding_location(wedding_loc_1984_flat)\npre2_wedding_loc_1984 = [nltk.word_tokenize(i) for i in pre1_wedding_loc_1984]\nwedding_loc_1984_final = flatten(pre2_wedding_loc_1984)\n\nwedding_loc_1984_fd = nltk.FreqDist(wedding_loc_1984_final)\nfor i in wedding_loc_1984_fd.items()[:10]:\n    print i , '\\n'",
     "prompt_number": 417,
     "outputs": [
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": "('Church', 364) \n\n('St.', 238) \n\n('Episcopal', 159) \n\n('Catholic', 99) \n\n('Roman', 97) \n\n('of', 90) \n\n('s', 68) \n\n('Temple', 48) \n\n('Christ', 39) \n\n('Club', 38) \n\n"
      }
     ],
     "language": "python",
     "trusted": false,
     "collapsed": false
    },
    {
     "metadata": {},
     "cell_type": "code",
     "input": "pre1_wedding_loc_1990 = extract_wedding_location(wedding_loc_1990_flat)\npre2_wedding_loc_1990 = [nltk.word_tokenize(i) for i in pre1_wedding_loc_1990]\nwedding_loc_1990_final = flatten(pre2_wedding_loc_1990)\n\nwedding_loc_1990_fd = nltk.FreqDist(wedding_loc_1990_final)\nfor i in wedding_loc_1990_fd.items()[:10]:\n    print i , '\\n'",
     "prompt_number": 398,
     "outputs": [
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": "('Church', 454) \n\n('St.', 287) \n\n('Catholic', 151) \n\n('Roman', 151) \n\n('Episcopal', 147) \n\n('of', 109) \n\n('s', 96) \n\n('Club', 93) \n\n('Temple', 69) \n\n('John', 43) \n\n"
      }
     ],
     "language": "python",
     "trusted": false,
     "collapsed": false
    },
    {
     "metadata": {},
     "cell_type": "code",
     "input": "pre1_wedding_loc_2000 = extract_wedding_location(wedding_loc_2000_flat)\npre2_wedding_loc_2000 = [nltk.word_tokenize(i) for i in pre1_wedding_loc_2000]\nwedding_loc_2000_final = flatten(pre2_wedding_loc_2000)\n\nwedding_loc_2000_fd = nltk.FreqDist(wedding_loc_2000_final)\nfor i in wedding_loc_2000_fd.items()[:10]:\n    print i , '\\n'",
     "prompt_number": 411,
     "outputs": [
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": "('Church', 297) \n\n('St.', 177) \n\n('Catholic', 96) \n\n('Roman', 95) \n\n('of', 84) \n\n('Episcopal', 78) \n\n('Club', 61) \n\n('s', 58) \n\n('John', 23) \n\n('Congregational', 21) \n\n"
      }
     ],
     "language": "python",
     "trusted": false,
     "collapsed": false
    },
    {
     "metadata": {},
     "cell_type": "code",
     "input": "pre1_wedding_loc_2010 = extract_wedding_location(wedding_loc_2010_flat)\npre2_wedding_loc_2010 = [nltk.word_tokenize(i) for i in pre1_wedding_loc_2010]\nwedding_loc_2010_final = flatten(pre2_wedding_loc_2010)\n\nwedding_loc_2010_fd = nltk.FreqDist(wedding_loc_2010_final)\nfor i in wedding_loc_2010_fd.items()[:10]:\n    print i , '\\n'",
     "prompt_number": 412,
     "outputs": [
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": "('Church', 73) \n\n('St.', 40) \n\n('Club', 38) \n\n('of', 24) \n\n('Catholic', 22) \n\n('House', 15) \n\n('Country', 14) \n\n('Roman', 13) \n\n('s', 11) \n\n('Hotel', 10) \n\n"
      }
     ],
     "language": "python",
     "trusted": false,
     "collapsed": false
    },
    {
     "metadata": {},
     "cell_type": "code",
     "input": "pre1_wedding_loc_2014 = extract_wedding_location(wedding_loc_2014_flat)\npre2_wedding_loc_2014 = [nltk.word_tokenize(i) for i in pre1_wedding_loc_2014]\nwedding_loc_2014_final = flatten(pre2_wedding_loc_2014)\n\nwedding_loc_2014_fd = nltk.FreqDist(wedding_loc_2014_final)\nfor i in wedding_loc_2014_fd.items()[:10]:\n    print i , '\\n'",
     "prompt_number": 413,
     "outputs": [
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": "('Church', 151) \n\n('St.', 96) \n\n('Club', 56) \n\n('Catholic', 43) \n\n('of', 39) \n\n('House', 26) \n\n('s', 26) \n\n('Roman', 22) \n\n('Episcopal', 16) \n\n('Chapel', 15) \n\n"
      }
     ],
     "language": "python",
     "trusted": false,
     "collapsed": false
    },
    {
     "metadata": {},
     "cell_type": "code",
     "input": "# Use functions below to see how well the individual patterns do and then add to the master dictionary",
     "outputs": [],
     "language": "python",
     "trusted": false,
     "collapsed": false
    },
    {
     "metadata": {},
     "cell_type": "code",
     "input": "# Tester Function to extract the relationships for individual patterns \ndef test_extract_rels (tagged_data, alist, rel1,rel2,regex):\n    for doc in tagged_data:\n        tree = chunker_rules(doc)\n        for rel in nltk.sem.extract_rels(rel1, rel2, tree, pattern = regex):\n            #print nltk.sem.relextract.show_raw_rtuple(rel)\n            alist.append(nltk.sem.relextract.show_raw_rtuple(rel))          \n    return alist  ",
     "prompt_number": 543,
     "outputs": [],
     "language": "python",
     "trusted": false,
     "collapsed": false
    },
    {
     "metadata": {},
     "cell_type": "code",
     "input": "#Tester Function to append lists if needed \ndef append_rels(lists_to_append):\n    master = []\n    for i in lists_to_append:\n        for rel in i: \n            master.append(i)\n    return master",
     "prompt_number": 544,
     "outputs": [],
     "language": "python",
     "trusted": false,
     "collapsed": false
    },
    {
     "metadata": {},
     "cell_type": "code",
     "input": "#Print function \ndef print_rels(rels):\n    print \"length of list: \",len(rels)\n    print \"=\" * 125 , \"\\n\"\n    for i in rels[:5]:\n        print i ",
     "prompt_number": 545,
     "outputs": [],
     "language": "python",
     "trusted": false,
     "collapsed": false
    },
    {
     "metadata": {},
     "cell_type": "code",
     "input": "#####Testing Individual Regex Patterns to add to master#########",
     "prompt_number": 538,
     "outputs": [],
     "language": "python",
     "trusted": false,
     "collapsed": false
    },
    {
     "metadata": {},
     "cell_type": "code",
     "input": "#Task1: Genders of who is being married\n#The bride is being extracted - related regex\nbride = []\nDAUGHTER = re.compile(r'.*\\bdaughter\\b')\nbride = test_extract_rels(custom_tagged_1984, bride, 'PERSON','PERSON',DAUGHTER)\nprint_rels(bride)",
     "outputs": [],
     "language": "python",
     "trusted": false,
     "collapsed": false
    },
    {
     "metadata": {},
     "cell_type": "code",
     "input": "#OPEN QUESTION: Am I supposed to check which ones got picked and then append it to the bride list???\n#Task 1 - continued , extracting the bride \n\n#Create a list \nmarries = []\n\n#Define Regex \nMARRIES = re.compile(r'.*\\b[Mm]arrie[sd]\\b')\nWED = re.compile(r'.*\\b[Ww]eds?\\b')\nENAGEGEMENT = re.compile(r'.*\\b([Ee]ngaged|[Ee]ngagement|[Ee]gagment\\b)')\n\n#Run Relationship Extraction Function \nmarries1 = test_extract_rels(custom_tagged_1984, marries, 'PERSON','ORGANIZATION',MARRIES)\nmarries2 = test_extract_rels(custom_tagged_1984, marries, 'PERSON','PERSON',MARRIES)\nmarries3 = test_extract_rels(custom_tagged_1984, marries, 'PERSON','PERSON',WED)\nmarries4 = test_extract_rels(custom_tagged_1984, marries, 'PERSON','PERSON',ENAGEGEMENT)\n\n# Append Lists above \nlists = [marries1,marries2,marries3,marries4]\nmaster_marries = append_rels(lists)\nprint len(master_marries)\nprint '=' *100\n#Print Lists\nfor i in master_marries[:5]:\n    for j in i: \n        print j , \"\\n\"",
     "outputs": [],
     "language": "python",
     "trusted": false,
     "collapsed": false
    },
    {
     "metadata": {},
     "cell_type": "code",
     "input": "#Task1: Genders of who is being married\n#The groom is being extracted - related regex\ngroom = []\nSON = re.compile(r'.*\\bson\\b')\ngroom = test_extract_rels(custom_tagged_1984, groom, 'PERSON','PERSON',SON)\nprint_rels(groom)",
     "outputs": [],
     "language": "python",
     "trusted": false,
     "collapsed": false
    },
    {
     "metadata": {},
     "cell_type": "code",
     "input": "#Task 2 - Hometowns of whom is being married \nhometown = []\nOF = re.compile(r'.*\\bof\\b')\nhometown = test_extract_rels(custom_tagged_1984, hometown,'PERSON','LOCATION',OF)\nprint_rels(hometown)\n# strip out the false positives ",
     "outputs": [],
     "language": "python",
     "trusted": false,
     "collapsed": false
    },
    {
     "metadata": {},
     "cell_type": "code",
     "input": "profession = []\nPROF = re.compile(r'.*\\b([Pp]rofessor|[Bb]anker|[Pp]rogrammer|[Aa]nalyst|[Aa]ssociate|[Hh]ead[master?]|[Cc]onsultant|[Cc]hairman|[Dd]octorate|[Aa]ccountant|[Ff]reelance|[Pp]artner|[Mm]anager|[Tt]eacher|[Ll]awyer|[Pp]resident|[Dd]ean|[Ee]ngineer|[Aa]ssistant|[Dd]irector|[Ee]xecutive)\\b')\nprofession = test_extract_rels(custom_tagged_1984, profession,'PERSON','ORGANIZATION',PROF)\nprint_rels(profession)",
     "outputs": [],
     "language": "python",
     "trusted": false,
     "collapsed": false
    },
    {
     "metadata": {},
     "cell_type": "code",
     "input": "#larger funnel\nprofession_v2 = []\nIS = re.compile(r'.*\\bis\\b')\nprofession_v2 = test_extract_rels(custom_tagged_1984, profession_v2,'PERSON','ORGANIZATION',IS)\nprint_rels(profession_v2)",
     "outputs": [],
     "language": "python",
     "trusted": false,
     "collapsed": false
    },
    {
     "metadata": {},
     "cell_type": "code",
     "input": "education = []\nEDU = re.compile(r'.*\\b([Dd]egree|[M.B.A.]|[M.S.]|[M.D.]|[Dd]esigner|[Mm]aster\\'s|[Gg]raduated?|[Ee]nrolled|complet[ing|ed|e])\\b')\neducation = test_extract_rels(custom_tagged_1984, education,'PERSON','ORGANIZATION',EDU)\nprint_rels(education)",
     "outputs": [],
     "language": "python",
     "trusted": false,
     "collapsed": false
    },
    {
     "metadata": {},
     "cell_type": "code",
     "input": "#larger funnel\neducation_v2 = []\nFROM = re.compile(r'.*\\bfrom\\b')\neducation_v2 = test_extract_rels(custom_tagged_1984, education_v2,'PERSON','ORGANIZATION',FROM)\nprint_rels(education_v2)",
     "outputs": [],
     "language": "python",
     "trusted": false,
     "collapsed": false
    },
    {
     "metadata": {},
     "cell_type": "code",
     "input": "#Marries Extractors",
     "outputs": [],
     "language": "python",
     "trusted": false,
     "collapsed": false
    },
    {
     "metadata": {},
     "cell_type": "code",
     "input": "marries = []\nMARRIES = re.compile(r'.*\\b[Mm]arrie[sd]\\b')\n\nfor rel in nltk.sem.extract_rels('PERSON','ORGANIZATION', tree, pattern = MARRIES):\n    marries.append(nltk.sem.relextract.show_raw_rtuple(rel))",
     "outputs": [],
     "language": "python",
     "trusted": false,
     "collapsed": false
    },
    {
     "metadata": {},
     "cell_type": "code",
     "input": "MARRIES = re.compile(r'.*\\b[Mm]arrie[sd]\\b')\n\nfor rel in nltk.sem.extract_rels('PERSON','PERSON', tree, pattern = MARRIES):\n#     print nltk.sem.relextract.show_raw_rtuple(rel)\n    marries.append(nltk.sem.relextract.show_raw_rtuple(rel))",
     "outputs": [],
     "language": "python",
     "trusted": false,
     "collapsed": false
    },
    {
     "metadata": {},
     "cell_type": "code",
     "input": "WED = re.compile(r'.*\\b[Ww]eds?\\b')\n\nfor rel in nltk.sem.extract_rels('PERSON','PERSON', tree, pattern = WED):\n#     print nltk.sem.relextract.show_raw_rtuple(rel)\n    marries.append(nltk.sem.relextract.show_raw_rtuple(rel))",
     "outputs": [],
     "language": "python",
     "trusted": false,
     "collapsed": false
    },
    {
     "metadata": {},
     "cell_type": "code",
     "input": "ENAGEGEMNT = re.compile(r'.*\\b([Ee]ngaged|[Ee]ngagement|[Ee]gagment\\b)')\n\nfor rel in nltk.sem.extract_rels('PERSON','PERSON', tree, pattern = MARRIES):\n    marries.append(nltk.sem.relextract.show_raw_rtuple(rel))",
     "outputs": [],
     "language": "python",
     "trusted": false,
     "collapsed": false
    },
    {
     "metadata": {},
     "cell_type": "code",
     "input": "AT = re.compile(r'.*\\b[Aa]t\\b')\n\nfor rel in nltk.sem.extract_rels('PERSON','LOCATION', tree, pattern = AT):\n    marries.append(nltk.sem.relextract.show_raw_rtuple(rel))",
     "outputs": [],
     "language": "python",
     "trusted": false,
     "collapsed": false
    },
    {
     "metadata": {},
     "cell_type": "code",
     "input": "#Putting it all together:",
     "outputs": [],
     "language": "python",
     "trusted": false,
     "collapsed": false
    },
    {
     "metadata": {},
     "cell_type": "code",
     "input": "#Set variable to collect info\nmarries = []\n\nfor doc in tagged_1984:\n    #Parse every document \n    tree = chunker_rules(doc)\n    #Relationship Extractors - #1\n    MARRIES = re.compile(r'.*\\b[Mm]arrie[sd]\\b')\n    for rel in nltk.sem.extract_rels('PERSON','PERSON', tree, pattern = MARRIES):\n        marries.append(nltk.sem.relextract.show_raw_rtuple(rel))\n    #Relationship Extractors - #2\n    MARRIESv2 = re.compile(r'.*\\b[Mm]arrie[sd]\\b')\n    for rel in nltk.sem.extract_rels('PERSON','ORGANIZATION', tree, pattern = MARRIESv2):\n        marries.append(nltk.sem.relextract.show_raw_rtuple(rel))\n    #Relationship Extractors - #3\n    WED = re.compile(r'.*\\b[Ww]eds?\\b')\n    for rel in nltk.sem.extract_rels('PERSON','PERSON', tree, pattern = WED):\n        marries.append(nltk.sem.relextract.show_raw_rtuple(rel))\n    #Relationship Extractors - #4\n    ENGAGEMENT = re.compile(r'.*\\b([Ee]ngaged|[Ee]ngagement|[Ee]gagment\\b)')\n    for rel in nltk.sem.extract_rels('PERSON','PERSON', tree, pattern = ENGAGEMENT):\n        marries.append(nltk.sem.relextract.show_raw_rtuple(rel))\n    ",
     "prompt_number": 577,
     "outputs": [],
     "language": "python",
     "trusted": false,
     "collapsed": false
    },
    {
     "metadata": {},
     "cell_type": "code",
     "input": "print len(marries)\n\nfor i in marries[:5]:\n    print i + '\\n'",
     "outputs": [],
     "language": "python",
     "trusted": false,
     "collapsed": false
    },
    {
     "metadata": {},
     "cell_type": "code",
     "input": "#Alternate preprocessing for non-regex extraction\ncorpus_dict2 = {} # Structure is set up to be Year > Month > Wedding Announcements \ntracker2 = {} #to track wedding announcements that enter the dictionary. Some announcements are social events not weddings.\n\nfor fileid in wordlists.fileids():\n    # Split up each wedding announcement in the file by the pattern below - '2 of 600 DOCUMENTS'\n    doc_list = re.split('((?m)^\\\\s+[0-9]+\\\\s*of\\\\s*[0-9]+\\\\s+DOCUMENTS)', wordlists.raw(fileid)) \n\n    num_docs = 0\n    for idx in range (2,len(doc_list),2):\n        num_docs += 1\n        paragraphs = doc_list[idx].split('\\r\\n\\r\\n')\n        sents = []\n        for i in range(len(paragraphs)): \n            paragraphs[i] = re.sub('\\r\\n', ' ', paragraphs[i].strip())\n            sents += sent_detector.tokenize(paragraphs[i])\n\n        whole_article_string = \" \".join(sents) \n\n        #remove article if it has byline because it would not be a wedding announcement\n        if \"BYLINE:\" in whole_article_string:\n            continue \n        #remove article if it is about Events, not weddings\n        if \"future events\" in whole_article_string.lower():\n            continue \n\n        #find date of article \n        date = re.search(date_pattern,whole_article_string)\n        m = re.search(mp,date.group(0))\n        month = m.group(0)\n        y = re.search(yp,date.group(0))\n        year = y.group(0)\n\n        #remove junk lines and add article to dictionary\n        good_lines = []\n        for sent in sents:\n            if len(sent) != 0:            \n                #remove junk lines \n                unwanted_pattern = r\"\\b(^(The New York Times)$|([0-9]{1})-[0-9]{2}$|[JFMASOND]\\w+ [0-9]{1,2}, ([0-9]{4})(,?) Sunday|^(Copyright) [0-9]{4} (The New York Times Company)$|^(DATELINE:.*)|^(SECTION:.*)|^(LENGTH:.*)|^(LOAD-DATE:.*)|(http:.*)|^(PUBLICATION-TYPE:.*)|^(LANGUAGE:.*)|^(GRAPHIC:.*))\\b\"  \n                junk_line = re.search(unwanted_pattern, sent)\n                if junk_line == None:\n                    \n                    good_lines.append(re.sub(r\"WEDDINGS/CELEBRATIONS; \", \"\", sent))\n        corpus_dict2.setdefault(year,{}).setdefault(month, []).append(good_lines)\n        tracker2.setdefault(fileid,[]).append(doc_list[idx-1])\n\n# print corpus_dict['1984']['March']",
     "outputs": [],
     "language": "python",
     "trusted": false,
     "collapsed": false
    },
    {
     "metadata": {},
     "cell_type": "code",
     "input": "same_sex_marriages_counts = {} \nsame_sex_marriages_text = {} \nfor year in (2002,2003,2005,2010,2014):\n    same_sex_marriages_counts[str(year)] = {'same_sex_announcement': 0, 'non_same_sex_announcement': 0} \n    same_sex_marriages_text[str(year)] = {'same_sex_announcement': [], 'non_same_sex_announcement': []} \n    for month in corpus_dict2[str(year)].keys():\n        for article in corpus_dict2[str(year)][month]:\n            article_str = \" \".join(article)\n            if 'bride' not in article_str and 'groom' not in article_str:\n                if ('daughter' not in article_str and 'son' in article_str) or ('son' not in article_str and 'daughter' in article_str):\n                    same_sex_marriages_counts[str(year)]['same_sex_announcement'] += 1\n                    same_sex_marriages_text[str(year)]['same_sex_announcement'].append(article_str)\n                elif ('Mr.' not in article_str and 'Ms.' in article_str) or ('Mr.' in article_str and ('Ms.' not in article_str and 'Mrs.' not in article_str)):\n                    same_sex_marriages_counts[str(year)]['same_sex_announcement'] += 1\n                    same_sex_marriages_text[str(year)]['same_sex_announcement'].append(article_str)                    \n            else:\n                same_sex_marriages_counts[str(year)]['non_same_sex_announcement'] += 1\n                same_sex_marriages_text[str(year)]['non_same_sex_announcement'].append(article_str)\n",
     "outputs": [],
     "language": "python",
     "trusted": false,
     "collapsed": false
    },
    {
     "metadata": {},
     "cell_type": "code",
     "input": "widow_counts = {} \nwidow_text = {} \nfor year in (1984,1985,1986,1990,1991,1995,2000,2001,2002,2003,2005,2010,2014):\n    widow_counts[str(year)] = {'yes': 0, 'no': 0} \n    widow_text[str(year)] = {'yes': [], 'no': []} \n    for month in corpus_dict2[str(year)].keys():\n        for article in corpus_dict2[str(year)][month]:\n            article_str = \" \".join(article)\n            if 'widow' not in article_str:\n#                 if ('daughter' not in article_str and 'son' in article_str) or ('son' not in article_str and 'daughter' in article_str):\n                widow_counts[str(year)]['no'] += 1\n                widow_text[str(year)]['no'].append(article_str)\n#                 elif ('Mr.' not in article_str and 'Ms.' in article_str) or ('Mr.' in article_str and ('Ms.' not in article_str and 'Mrs.' not in article_str)):\n#                     same_sex_marriages_counts[str(year)]['same_sex_announcement'] += 1\n#                     same_sex_marriages_text[str(year)]['same_sex_announcement'].append(article_str)                    \n            else:\n                widow_counts[str(year)]['yes'] += 1\n                widow_text[str(year)]['yes'].append(article_str)\n",
     "outputs": [],
     "language": "python",
     "trusted": false,
     "collapsed": false
    },
    {
     "metadata": {},
     "cell_type": "code",
     "input": "divorce_counts = {} \ndivorce_text = {} \nfor year in (1984,1985,1986,1990,1991,1995,2000,2001,2002,2003,2005,2010,2014):\n    divorce_counts[str(year)] = {'yes': 0, 'no': 0} \n    divorce_text[str(year)] = {'yes': [], 'no': []} \n    for month in corpus_dict2[str(year)].keys():\n        for article in corpus_dict2[str(year)][month]:\n            article_str = \" \".join(article)\n            if 'divorce' not in article_str:\n#                 if ('daughter' not in article_str and 'son' in article_str) or ('son' not in article_str and 'daughter' in article_str):\n                divorce_counts[str(year)]['no'] += 1\n                divorce_text[str(year)]['no'].append(article_str)\n#                 elif ('Mr.' not in article_str and 'Ms.' in article_str) or ('Mr.' in article_str and ('Ms.' not in article_str and 'Mrs.' not in article_str)):\n#                     same_sex_marriages_counts[str(year)]['same_sex_announcement'] += 1\n#                     same_sex_marriages_text[str(year)]['same_sex_announcement'].append(article_str)                    \n            else:\n                divorce_counts[str(year)]['yes'] += 1\n                divorce_text[str(year)]['yes'].append(article_str)\n",
     "outputs": [],
     "language": "python",
     "trusted": false,
     "collapsed": false
    },
    {
     "metadata": {},
     "cell_type": "code",
     "input": "keephername_counts = {} \nkeephername_text = {} \nkhn_pattern = r'keep\\w* her name'\nprof_pattern = r'continu\\w* to use her name professionally'\nfor year in (1984,1985,1986,1990,1991,1995,2000,2001,2002,2003,2005,2010,2014):\n    keephername_counts[str(year)] = {'yes': 0, 'no': 0} \n    keephername_text[str(year)] = {'yes': [], 'no': []} \n    for month in corpus_dict2[str(year)].keys():\n        for article in corpus_dict2[str(year)][month]:\n            article_str = \" \".join(article)\n            match1 = re.search(khn_pattern, article_str)\n            match2 = re.search(prof_pattern, article_str)\n            if match1 != None or match2 != None:\n#                 if ('daughter' not in article_str and 'son' in article_str) or ('son' not in article_str and 'daughter' in article_str):\n                keephername_counts[str(year)]['yes'] += 1\n                keephername_text[str(year)]['yes'].append(article_str)\n#                 elif ('Mr.' not in article_str and 'Ms.' in article_str) or ('Mr.' in article_str and ('Ms.' not in article_str and 'Mrs.' not in article_str)):\n#                     same_sex_marriages_counts[str(year)]['same_sex_announcement'] += 1\n#                     same_sex_marriages_text[str(year)]['same_sex_announcement'].append(article_str)                    \n            else:\n                keephername_counts[str(year)]['no'] += 1\n                keephername_text[str(year)]['no'].append(article_str)\n",
     "outputs": [],
     "language": "python",
     "trusted": false,
     "collapsed": false
    }
   ],
   "metadata": {}
  }
 ],
 "metadata": {
  "name": "",
  "signature": "sha256:ca007ec134faa279beea9fc48bc2c8bcf287761b77c467f1c555856f119dc22f"
 },
 "nbformat": 3
 }