fayeip · August 29, 2015 14:11
diff --git a/NYT_Weddings_12-15-14_stripped.ipynb b/NYT_Weddings_12-15-14_stripped.ipynb
 {
 "worksheets": [
  {
   "cells": [
    {
     "metadata": {},
     "cell_type": "code",
     "input": "import nltk\nfrom nltk.corpus import PlaintextCorpusReader\nimport re\nfrom itertools import chain\nfrom nltk import tokenize\nfrom nltk.corpus import stopwords\nimport nltk.data\nimport json\nimport pdb\nfrom collections import defaultdict",
     "prompt_number": 54,
     "outputs": [],
     "language": "python",
     "trusted": true,
     "collapsed": false
    },
    {
     "metadata": {},
     "cell_type": "code",
     "input": "#Importing corpus\ncorpus_root = 'data'\nwordlists = PlaintextCorpusReader(corpus_root, '.*\\\\.txt')\nsent_detector = nltk.data.load('tokenizers/punkt/english.pickle')",
     "prompt_number": 2,
     "outputs": [],
     "language": "python",
     "trusted": true,
     "collapsed": false
    },
    {
     "metadata": {},
     "cell_type": "code",
     "input": "#Clean up Process - create date regex parameters \ndate_pattern = '((J(anuary|u(ne|ly))|February|Ma(rch|y)|A(pril|ugust)|(((Sept|Nov|Dec)em)|Octo)ber).*([0-9]))'\nmp = '(J(anuary|u(ne|ly))|February|Ma(rch|y)|A(pril|ugust)|(((Sept|Nov|Dec)em)|Octo)ber)'\nyp = '[0-9]{4}'",
     "outputs": [],
     "language": "python",
     "trusted": true,
     "collapsed": false
    },
    {
     "metadata": {},
     "cell_type": "code",
     "input": "# Testing the patterns \ntest = \"This is the month of November 9, 2014\"\ndate = re.search(date_pattern,test)\nm = re.search(mp,date.group(0))\nmonth = m.group(0)\ny = re.search(yp,date.group(0))\nyear = y.group(0)",
     "outputs": [],
     "language": "python",
     "trusted": true,
     "collapsed": false
    },
    {
     "metadata": {},
     "cell_type": "code",
     "input": "# start processing\n#Set the dictionaries \ncorpus_dict = {}\n\n#Putting it all together\nfor fileid in wordlists.fileids():\n    #Part 1: split of xx of DOCUMENTS \n    doc_list = re.split('((?m)^\\\\s+[0-9]+\\\\s*of\\\\s*[0-9]+\\\\s+DOCUMENTS)', wordlists.raw(fileid))\n    doc_list.pop(0)                 #got rid of garbage first empty line\n    master_list = list()            # put all documents by id, header, footer\n    #print len(doc_list)            # keep for testing -- how many documents within a single file \n    \n    #Part 2: split into id, head and footer and create a triple tuple \n    for idx in range(0, len(doc_list), 2):\n        # add a new tuple of id, header, footer\n        # split condition in order of importance\n        split_conds = ['words\\r\\n\\r\\n', 'Edition\\r\\n\\r\\n', 'Society Desk\\r\\n\\r\\n','Society Desk\\r\\n\\r\\n\\r\\n','DATELINE: Camden, Me.,\\r\\n\\r\\n\\r\\n']\n        doc_split = []\n        for cond in split_conds:\n            doc_split = re.split(cond,doc_list[idx+1], 1)\n            if len(doc_split) == 2:\n                break\n        #Part 2 contd: Error check to see if any of the splits didn't go through \n        if len(doc_split) < 2:\n            doc_parts = (doc_list[idx], doc_split)\n            print \"too few traces\"\n            pdb.set_trace()\n        elif len(doc_split) > 2:\n            print \"too many splits\"\n        else:\n            doc_parts = (doc_list[idx], doc_split[0], doc_split[1])\n#             print doc_split[0]\n#             print '<><><><><><><><><>'\n#             print doc_split[1]\n#             print \"****************************************\"\n        master_list.append(doc_parts) #Create that tuple triple \n    \n    year_counter = []\n    #Part 3: Read the header and extract date \n    for doc in master_list:\n        #Part 3 a: Header cleaning steps \n        clean_header = re.sub(r\"\\b(The New York Times|(DATELINE:.*)|(BYLINE.*)|(.*Correction Appended.*)|(SECTION:.*)|(LENGTH:.*)|(LOAD-DATE:.*)|(http:.*)|(LANGUAGE:.*)|(GRAPHIC:.*)|(Copyright.*)|(Late Edition - Final.*))\\b\", \"\", doc[1])\n        clean_header = clean_header.replace(\"\\r\",\"\").strip()\n        clean_header = [x for x in clean_header.split('\\n') if any(x.isalnum() for x in x)]\n        header_final = ' '.join(clean_header)\n\n        #Part 3b: Extracting the date\n        date = re.search(date_pattern,header_final)\n        m = re.search(mp,date.group(0))\n        month = m.group(0)\n        y = re.search(yp,date.group(0))\n        year = y.group(0)\n        year_counter.append(year)    \n\n        if \"Events\" not in header_final:\n            body = doc[2]\n            clean_sent = re.sub(r\"\\b(The New York Times|(DATELINE:.*)|(SECTION:.*)|(LENGTH:.*)|(LOAD-DATE:.*)|(http:.*)|(URL:.*)|(LANGUAGE:.*)|(PUBLICATION.*)|(GRAPHIC:.*)|(Copyright.*))\\b\", \"\", body)\n            body = re.sub('\\r\\n(?!\\r\\n)', ' ',clean_sent)\n\n            #Part 4 adding to the dictionary\n            corpus_dict.setdefault(year,{}).setdefault(month, []).append((doc[0],header_final,body))     \n            \n#Part 5: Write to a JSON file \nwith open('data/dict2014.json', 'wb') as fp:\n    json.dump(corpus_dict, fp)\n    ",
     "outputs": [],
     "language": "python",
     "trusted": true,
     "collapsed": false
    },
    {
     "metadata": {},
     "cell_type": "code",
     "input": "# print corpus_dict['1984']['March']",
     "prompt_number": 55,
     "outputs": [],
     "language": "python",
     "trusted": true,
     "collapsed": false
    },
    {
     "metadata": {},
     "cell_type": "code",
     "input": "#Sent to AWS to tag",
     "outputs": [],
     "language": "python",
     "trusted": true,
     "collapsed": false
    },
    {
     "metadata": {},
     "cell_type": "code",
     "input": "#Download Stanford NER taggers\nfrom nltk.tag.stanford import POSTagger\nfrom nltk.tag.stanford import NERTagger\npost = POSTagger('lib/stanford-postagger-2014-10-26/models/english-bidirectional-distsim.tagger',\n               'lib/stanford-postagger-2014-10-26/stanford-postagger.jar', 'utf-8')\n\nnert = NERTagger('lib/stanford-ner-2014-10-26/classifiers/english.all.3class.distsim.crf.ser.gz',\n               'lib/stanford-ner-2014-10-26/stanford-ner.jar', 'utf-8')",
     "outputs": [],
     "language": "python",
     "trusted": true,
     "collapsed": false
    },
    {
     "metadata": {},
     "cell_type": "code",
     "input": "#Load the entity tagged file as a tuple of tuples \nfrom ast import literal_eval\n\ntagged_1984 = []\n\nwith open('1984_tagged.txt', 'r') as f:\n    for line in f:\n        line.split(',')\n        tagged_1984.append(literal_eval(line.strip()))\n",
     "outputs": [],
     "language": "python",
     "trusted": true,
     "collapsed": false
    },
    {
     "metadata": {},
     "cell_type": "code",
     "input": "#Customizing the tagger\n#assigning a custom tag in the word,tag \n\nfrom ast import literal_eval\n\ndef alter_source(sourcefile):\n    f_before = open(sourcefile,'r')\n    f_before_str = f_before.read()\n    f_before.close()\n\n    \"\"\"Customizing the tags labeled 'O'\n        1) Widow , Widower, Widowed     >> label: W for widow\n        2) Mr., Mrs., Adm., Sgt., Dr.   >> label: PERSON\n        3) Rev., Rabbi, priest          >> label: R for religious \n        4) bride                        >> label: B for bride \n        5) bridegroom, groom            >> label: G for groom\n\n    \"\"\"\n\n    f_after_str = ''\n    # Adding the custom tag set 1 - widow\n    f_after_str_1 = re.sub(r\"\\(\\'widow\\', \\'O\\'\\)\", \"('widow', 'W')\",f_before_str)\n    f_after_str_2 = re.sub(r\"\\(\\'widower\\', \\'O\\'\\)\", \"('widower', 'W')\",f_after_str_1)\n    f_after_str_3 = re.sub(r\"\\(\\'widowed\\', \\'O\\'\\)\", \"('widowed', 'W')\",f_after_str_2)\n\n    #Adding the custom tag set 2 - person \n    f_after_str_4 = re.sub(r\"\\(\\'Mr.\\', \\'O\\'\\)\", \"('Mr.', 'PERSON')\",f_after_str_3)\n    f_after_str_5 = re.sub(r\"\\(\\'Mrs.\\', \\'O\\'\\)\", \"('Mrs.', 'PERSON')\",f_after_str_4)\n    f_after_str_6 = re.sub(r\"\\(\\'Adm.\\', \\'O\\'\\)\", \"('Adm.', 'PERSON')\",f_after_str_5)\n    f_after_str_7 = re.sub(r\"\\(\\'Sgt.\\', \\'O\\'\\)\", \"('Sgt.', 'PERSON')\",f_after_str_6)\n    f_after_str_8 = re.sub(r\"\\(\\'Dr.\\', \\'O\\'\\)\", \"('Dr.', 'PERSON')\",f_after_str_7)\n\n\n    #Adding the custom tag set 3 - religious head \n    f_after_str_9 = re.sub(r\"\\(\\'Rev.\\', \\'O\\'\\)\", \"('Rev.', 'R')\",f_after_str_8)\n    f_after_str_10 = re.sub(r\"\\(\\'\\bRabbi\\b\\', \\'O\\'\\)\", \"('Rabbi', 'R')\",f_after_str_9)\n    f_after_str_11 = re.sub(r\"\\(\\'\\bpriest\\b\\', \\'O\\'\\)\", \"('priest','R')\",f_after_str_10)\n\n    # Adding the custom tag set 4 - divorced\n    f_after_str_12 = re.sub(r\"\\(\\'\\bdivorce\\b\\', \\'O\\'\\)\", \"('divorce', 'D')\",f_after_str_11)\n    f_after_str_13 = re.sub(r\"\\(\\'\\bdivorced\\b\\', \\'O\\'\\)\", \"('divorced', 'D')\",f_after_str_12)\n\n    # Adding the custom tag set 4 - divorced\n    f_after_str_14 = re.sub(r\"\\(\\'divorce\\', \\'O\\'\\)\", \"sufia\", f_after_str_13)\n    f_after_str_15 = re.sub(r\"\\(\\'divorced\\', \\'O\\'\\)\", \"('divorced', 'D')\",f_after_str_14)\n\n    # Adding the custom tag set 5 - bride\n    f_after_str_16 = re.sub(r\"\\(\\'\\bbride\\b\\', \\'O\\'\\)\", \"('bride', 'B')\",f_after_str_15)\n\n    # Adding the custom tag set 6 - bridegroom\n    f_after_str_17 = re.sub(r\"\\(\\'\\bbridegroom\\b\\', \\'O\\'\\)\", \"('bridegroom', 'G')\",f_after_str_16)\n    f_after_str_final = re.sub(r\"\\(\\'\\bgroom\\b\\', \\'O\\'\\)\", \"('groom', 'G')\",f_after_str_17)\n    \n    return f_after_str_final\n\n\ndef apply_custom_tags (targetfile, custom_tags):\n    f = open(targetfile,'w')\n    f.write(custom_tags)\n    f.close()\n\n    custom_tag_list = []\n\n    with open(targetfile, 'r') as g:\n        for line in g:\n            line.split('\\n')\n            custom_tag_list.append(literal_eval(line.strip()))\n    return custom_tag_list",
     "prompt_number": 3,
     "outputs": [],
     "language": "python",
     "trusted": true,
     "collapsed": false
    },
    {
     "metadata": {},
     "cell_type": "code",
     "input": "#Run custom tagger for all 5 years\n\ncustom_tagged_1984 = apply_custom_tags('data_tagged2/1984_tagged_custom.txt',alter_source('data_tagged/1984_tagged.txt'))\ncustom_tagged_1990 = apply_custom_tags('data_tagged2/1990_tagged_custom.txt',alter_source('data_tagged/1990_tagged.txt'))\ncustom_tagged_2000 = apply_custom_tags('data_tagged2/2000_tagged_custom.txt',alter_source('data_tagged/2000_tagged.txt'))\ncustom_tagged_2010 = apply_custom_tags('data_tagged2/2010_tagged_custom.txt',alter_source('data_tagged/2010_tagged.txt'))\ncustom_tagged_2014 = apply_custom_tags('data_tagged2/2014_tagged_custom.txt',alter_source('data_tagged/2014_tagged.txt'))",
     "prompt_number": 4,
     "outputs": [],
     "language": "python",
     "trusted": true,
     "collapsed": false
    },
    {
     "metadata": {},
     "cell_type": "code",
     "input": "# print custom_tagged_1984 ",
     "prompt_number": 56,
     "outputs": [],
     "language": "python",
     "trusted": true,
     "collapsed": false
    },
    {
     "metadata": {},
     "cell_type": "code",
     "input": "# Import the RegexpParser\nfrom nltk.chunk import RegexpParser\n\n# Define your custom tagged data. \n# entities\n\ndef chunker_rules(values):\n    # Define  custom grammar (modified to be a valid regex).\n    grammar = r'''\n        PERSON:\n                {<PERSON><O><PERSON>+}\n                {<PERSON>+}\n        ORGANIZATION: \n                {<ORGANIZATION>+}\n        LOCATION: \n                {<LOCATION>+}        \n\n            '''\n    cp = nltk.RegexpParser(grammar) # Create an instance of your custom parser.\n    return cp.parse(values)         # Parse!\n\ndef entity_chunker(tagged_docs):\n    chunks = []\n    for doc in tagged_docs:\n        tree = chunker_rules(doc)\n        for subtree in tree.subtrees():\n            if (subtree.node == 'CHUNK'):\n                leaflist = [leaf[0] for leaf in subtree.leaves()]\n                chunks.append(' '.join(leaflist))\n#             if verb in leaflist:\n#                 chunks.append(' '.join(leaflist))\n    return chunks\n ",
     "prompt_number": 33,
     "outputs": [],
     "language": "python",
     "trusted": true,
     "collapsed": false
    },
    {
     "metadata": {},
     "cell_type": "code",
     "input": "# Creating a dictionary for each wedding announcement",
     "outputs": [],
     "language": "python",
     "trusted": true,
     "collapsed": false
    },
    {
     "metadata": {},
     "cell_type": "code",
     "input": "rel_dict = defaultdict(dict)",
     "prompt_number": 34,
     "outputs": [],
     "language": "python",
     "trusted": true,
     "collapsed": false
    },
    {
     "metadata": {},
     "cell_type": "code",
     "input": "def extract_rels(rel_dict, dict_key, relKey, rel1, rel2, tree,regex): \n    \"\"\"This function extracts the relationships\n       Function Outputs:\n           final output = {dict_key: [{relKey:[relationship extracted]}, {relKey:[relationship extracted]}]}\n           example =      {1:[{bride:'Mary Flyn',groom: 'John Mayer'}]}\n       \n       Function Inputs:\n           1) rel_dict = This is the default dict that will contain all the patterns in a dictionary per wedding announcement\n           2) dict_key = This is basically a counter per wedding announcment \n           3) relKey   = This is the second key i.e. the relationship type you want to get values for \n           4) rel1 , rel2, regex = 'PERSON' [the word \"marries\"] 'PERSON'\n           5) tree = the parsed tree\n    \"\"\"  \n    for rel in nltk.sem.extract_rels(rel1, rel2, tree, pattern = regex):\n#         print nltk.sem.relextract.show_raw_rtuple(rel)\n#         rels_str = nltk.sem.relextract.show_raw_rtuple(rel) \n        dict_values = []\n        dict_values.append(nltk.sem.relextract.show_raw_rtuple(rel))\n        rel_dict[str(dict_key)][relKey] = dict_values\n        \n#         if relKey in rel_dict[str(dict_key)].keys():\n#             rel_dict[str(dict_key)][relKey].append(dict_values)\n#         else:\n#         rel_dict[str(dict_key)][relKey] = dict_values",
     "prompt_number": 35,
     "outputs": [],
     "language": "python",
     "trusted": true,
     "collapsed": false
    },
    {
     "metadata": {},
     "cell_type": "code",
     "input": "def make_rels_dict(tagged_data, rel_dict, relKey, rel1,rel2,regex):\n    \"\"\"This function makes the dictionary for the relationships you want to extract -- \n        read comments in function \"extract_rels\" for more context\"\"\"\n    dict_key = 1\n        \n    for doc in tagged_data:\n        tree = chunker_rules(doc)\n        extract_rels (rel_dict, dict_key, relKey, rel1, rel2, tree,regex)\n        dict_key +=1\n    return rel_dict",
     "prompt_number": 36,
     "outputs": [],
     "language": "python",
     "trusted": true,
     "collapsed": false
    },
    {
     "metadata": {},
     "cell_type": "code",
     "input": "DAUGHTER = re.compile(r'.*\\bdaughter\\b')\nmake_rels_dict(custom_tagged_1984,rel_dict,'daughter_relation','PERSON','PERSON',DAUGHTER)\nprint \"DAUGHTER relation done\"",
     "prompt_number": 57,
     "outputs": [
      {
       "output_type": "stream",
       "text": "DAUGHTER relation done\n",
       "stream": "stdout"
      }
     ],
     "language": "python",
     "trusted": true,
     "collapsed": false
    },
    {
     "metadata": {},
     "cell_type": "code",
     "input": "# regex to lift out names and remove tags \n\ndef lift_names_remove_tags(relation, extracted_relation_str):\n    \n    if relation == 'DAUGHTER' or relation == 'SON':\n        rels_pattern = r\"\\[PER: (.*)\\] (.*) \\[PER: (.*)\\]\"\n        name_pattern = re.compile(r\"(\\w*\\.?)\\/PERSON\")\n        names_search = re.search(rels_pattern, extracted_relation_str)\n        bride_or_groom_name = name_pattern.findall(names_search.group(1))\n        parents_name = name_pattern.findall(names_search.group(3))\n        bride_or_groom_name_str = ''\n        parents_name_str = ''\n        for bg in bride_or_groom_name:\n            bride_or_groom_name_str += bg + \" \"\n        for p in parents_name:\n            parents_name_str += p + \" \"\n            \n        return bride_or_groom_name_str.strip(), parents_name_str.strip()\n",
     "prompt_number": 58,
     "outputs": [],
     "language": "python",
     "trusted": true,
     "collapsed": false
    },
    {
     "metadata": {},
     "cell_type": "code",
     "input": "for key in rel_dict.keys():\n    for k in rel_dict[key].keys():\n#         print rel_dict[key][k]\n        for each in rel_dict[key][k]:\n            if k == 'daughter_relation':\n                bridename, brideparentsname = lift_names_remove_tags('DAUGHTER',each)\n                rel_dict[key]['bride_name'] = bridename\n                rel_dict[key]['bride_parents_names'] = brideparentsname ",
     "prompt_number": 59,
     "outputs": [],
     "language": "python",
     "trusted": true,
     "collapsed": false
    },
    {
     "metadata": {},
     "cell_type": "code",
     "input": "# print rel_dict",
     "prompt_number": 60,
     "outputs": [],
     "language": "python",
     "trusted": true,
     "collapsed": false
    },
    {
     "metadata": {},
     "cell_type": "code",
     "input": "SON = re.compile(r'.*\\bson\\b')\nmake_rels_dict(custom_tagged_1984,rel_dict,'son_relation','PERSON','PERSON',SON) \nprint \"Groom key has been added to master dict\"",
     "prompt_number": 61,
     "outputs": [
      {
       "output_type": "stream",
       "text": "Groom key has been added to master dict\n",
       "stream": "stdout"
      }
     ],
     "language": "python",
     "trusted": true,
     "collapsed": false
    },
    {
     "metadata": {},
     "cell_type": "code",
     "input": "for key in rel_dict.keys():\n    for k in rel_dict[key].keys():\n#         print rel_dict[key][k]\n        for each in rel_dict[key][k]:\n            if k == 'son_relation':\n                groomname, groomparentsname = lift_names_remove_tags('SON',each)\n                rel_dict[key]['groom_name'] = groomname\n                rel_dict[key]['groom_parents_names'] = groomparentsname ",
     "prompt_number": 62,
     "outputs": [],
     "language": "python",
     "trusted": true,
     "collapsed": false
    },
    {
     "metadata": {},
     "cell_type": "code",
     "input": "print rel_dict['347']",
     "prompt_number": 63,
     "outputs": [
      {
       "output_type": "stream",
       "text": "{'daughter_relation': [\"[PER: 'Leicia/PERSON Sharon/PERSON Osborne/PERSON'] ',/O the/O daughter/O of/O' [PER: 'Mr./PERSON and/O Mrs./PERSON Philip/PERSON Barry/PERSON Osborne/PERSON']\"], 'bride_parents_names': 'Mr. Mrs. Philip Barry Osborne', 'groom_parents_names': 'Mr. Mrs. John A. Milano', 'son_relation': [\"[PER: 'Michael/PERSON Anthony/PERSON Milano/PERSON'] ',/O a/O son/O of/O' [PER: 'Mr./PERSON and/O Mrs./PERSON John/PERSON A./PERSON Milano/PERSON']\"], 'bride_name': 'Leicia Sharon Osborne', 'groom_name': 'Michael Anthony Milano'}\n",
       "stream": "stdout"
      }
     ],
     "language": "python",
     "trusted": true,
     "collapsed": false
    },
    {
     "metadata": {},
     "cell_type": "code",
     "input": "# To test for empty patterns in the rel_dict\nempty = []\nfor i in range (1):\n    if test_dict[str(i)].keys() == []:\n        print i\n        empty.append(i)\nprint \"final len\", len(empty)",
     "outputs": [],
     "language": "python",
     "trusted": true,
     "collapsed": false
    },
    {
     "metadata": {},
     "cell_type": "code",
     "input": "# Use functions below to see how well the individual patterns do and then add to the master dictionary",
     "outputs": [],
     "language": "python",
     "trusted": true,
     "collapsed": false
    },
    {
     "metadata": {},
     "cell_type": "code",
     "input": "# Tester Function to extract the relationships for individual patterns \ndef test_extract_rels (tagged_data, alist, rel1,rel2,regex):\n    for doc in tagged_data:\n        tree = chunker_rules(doc)\n        for rel in nltk.sem.extract_rels(rel1, rel2, tree, pattern = regex):\n            #print nltk.sem.relextract.show_raw_rtuple(rel)\n            alist.append(nltk.sem.relextract.show_raw_rtuple(rel))          \n    return alist  ",
     "outputs": [],
     "language": "python",
     "trusted": true,
     "collapsed": false
    },
    {
     "metadata": {},
     "cell_type": "code",
     "input": "#Tester Function to append lists if needed \ndef append_rels(lists_to_append):\n    master = []\n    for i in lists_to_append:\n        for rel in i: \n            master.append(i)\n    return master",
     "outputs": [],
     "language": "python",
     "trusted": true,
     "collapsed": false
    },
    {
     "metadata": {},
     "cell_type": "code",
     "input": "#Print function \ndef print_rels(rels):\n    print \"length of list: \",len(rels)\n    print \"=\" * 125 , \"\\n\"\n    for i in rels:\n        print i ",
     "outputs": [],
     "language": "python",
     "trusted": true,
     "collapsed": false
    },
    {
     "metadata": {},
     "cell_type": "code",
     "input": "#####Testing Individual Regex Patterns to add to master#########",
     "outputs": [],
     "language": "python",
     "trusted": true,
     "collapsed": false
    },
    {
     "metadata": {},
     "cell_type": "code",
     "input": "#Task1: Genders of who is being married\n#The bride is being extracted - related regex\nbride = []\nDAUGHTER = re.compile(r'.*\\bdaughter\\b')\nbride = extract_rels(tagged_1984, bride, 'PERSON','PERSON',DAUGHTER)\nprint_rels(bride)",
     "outputs": [],
     "language": "python",
     "trusted": true,
     "collapsed": false
    },
    {
     "metadata": {},
     "cell_type": "code",
     "input": "#OPEN QUESTION: Am I supposed to check which ones got picked and then append it to the bride list???\n#Task 1 - continued , extracting the bride \n\n#Create a list \nmarries = []\n\n#Define Regex \nMARRIES = re.compile(r'.*\\b[Mm]arrie[sd]\\b')\nWED = re.compile(r'.*\\b[Ww]eds?\\b')\nENAGEGEMENT = re.compile(r'.*\\b([Ee]ngaged|[Ee]ngagement|[Ee]gagment\\b)')\n\n#Run Relationship Extraction Function \nmarries1 = extract_rels(tagged_1984, marries, 'PERSON','ORGANIZATION',MARRIES)\nmarries2 = extract_rels(tagged_1984, marries, 'PERSON','PERSON',MARRIES)\nmarries3 = extract_rels(tagged_1984, marries, 'PERSON','PERSON',WED)\nmarries4 = extract_rels(tagged_1984, marries, 'PERSON','PERSON',ENAGEGEMENT)\n\n# Append Lists above \nlists = [marries1,marries2,marries3,marries4]\nmaster_marries = append_rels(lists)\nprint len(master_marries)\nprint '=' *100\n#Print Lists\nfor i in master_marries:\n    for j in i: \n        print j , \"\\n\"",
     "outputs": [],
     "language": "python",
     "trusted": true,
     "collapsed": false
    },
    {
     "metadata": {},
     "cell_type": "code",
     "input": "#Task1: Genders of who is being married\n#The groom is being extracted - related regex\ngroom = []\nSON = re.compile(r'.*\\bson\\b')\ngroom = extract_rels(tagged_1984, groom, 'PERSON','PERSON',SON)\nprint_rels(groom)",
     "outputs": [],
     "language": "python",
     "trusted": true,
     "collapsed": false
    },
    {
     "metadata": {},
     "cell_type": "code",
     "input": "#Task 2 - Hometowns of whom is being married \nhometown = []\nOF = re.compile(r'.*\\bof\\b')\nhometown = extract_rels(march1984_tagged, hometown,'PERSON','LOCATION',OF)\nprint_rels(hometown)\n# strip out the false positives ",
     "outputs": [],
     "language": "python",
     "trusted": true,
     "collapsed": false
    },
    {
     "metadata": {},
     "cell_type": "code",
     "input": "profession = []\nPROF = re.compile(r'.*\\b([Pp]rofessor|[Bb]anker|[Pp]rogrammer|[Aa]nalyst|[Aa]ssociate|[Hh]ead[master?]|[Cc]onsultant|[Cc]hairman|[Dd]octorate|[Aa]ccountant|[Ff]reelance|[Pp]artner|[Mm]anager|[Tt]eacher|[Ll]awyer|[Pp]resident|[Dd]ean|[Ee]ngineer|[Aa]ssistant|[Dd]irector|[Ee]xecutive)\\b')\nprofession = extract_rels(tagged_1984, profession,'PERSON','ORGANIZATION',PROF)\nprint_rels(profession)",
     "outputs": [],
     "language": "python",
     "trusted": true,
     "collapsed": false
    },
    {
     "metadata": {},
     "cell_type": "code",
     "input": "#larger funnel\nprofession_v2 = []\nIS = re.compile(r'.*\\bis\\b')\nprofession_v2 = extract_rels(tagged_1984, profession_v2,'PERSON','ORGANIZATION',IS)\nprint_rels(profession_v2)",
     "outputs": [],
     "language": "python",
     "trusted": true,
     "collapsed": false
    },
    {
     "metadata": {},
     "cell_type": "code",
     "input": "education = []\nEDU = re.compile(r'.*\\b([Dd]egree|[M.B.A.]|[M.S.]|[M.D.]|[Dd]esigner|[Mm]aster\\'s|[Gg]raduate[d]?|complet[ing|ed|e])\\b')\neducation = extract_rels(tagged_1984, education,'PERSON','ORGANIZATION',EDU)\nprint_rels(education)",
     "outputs": [],
     "language": "python",
     "trusted": true,
     "collapsed": false
    },
    {
     "metadata": {},
     "cell_type": "code",
     "input": "#larger funnel\neducation_v2 = []\nFROM = re.compile(r'.*\\bfrom\\b')\neducation_v2 = extract_rels(tagged_1984, education_v2,'PERSON','ORGANIZATION',FROM)\nprint_rels(education_v2)",
     "outputs": [],
     "language": "python",
     "trusted": true,
     "collapsed": false
    },
    {
     "metadata": {},
     "cell_type": "code",
     "input": "#Marries Extractors",
     "outputs": [],
     "language": "python",
     "trusted": true,
     "collapsed": false
    },
    {
     "metadata": {},
     "cell_type": "code",
     "input": "marries = []\nMARRIES = re.compile(r'.*\\b[Mm]arrie[sd]\\b')\n\nfor rel in nltk.sem.extract_rels('PERSON','ORGANIZATION', tree, pattern = MARRIES):\n    marries.append(nltk.sem.relextract.show_raw_rtuple(rel))",
     "outputs": [],
     "language": "python",
     "trusted": true,
     "collapsed": false
    },
    {
     "metadata": {},
     "cell_type": "code",
     "input": "MARRIES = re.compile(r'.*\\b[Mm]arrie[sd]\\b')\n\nfor rel in nltk.sem.extract_rels('PERSON','PERSON', tree, pattern = MARRIES):\n#     print nltk.sem.relextract.show_raw_rtuple(rel)\n    marries.append(nltk.sem.relextract.show_raw_rtuple(rel))",
     "outputs": [],
     "language": "python",
     "trusted": true,
     "collapsed": false
    },
    {
     "metadata": {},
     "cell_type": "code",
     "input": "WED = re.compile(r'.*\\b[Ww]eds?\\b')\n\nfor rel in nltk.sem.extract_rels('PERSON','PERSON', tree, pattern = WED):\n#     print nltk.sem.relextract.show_raw_rtuple(rel)\n    marries.append(nltk.sem.relextract.show_raw_rtuple(rel))",
     "outputs": [],
     "language": "python",
     "trusted": true,
     "collapsed": false
    },
    {
     "metadata": {},
     "cell_type": "code",
     "input": "ENAGEGEMNT = re.compile(r'.*\\b([Ee]ngaged|[Ee]ngagement|[Ee]gagment\\b)')\n\nfor rel in nltk.sem.extract_rels('PERSON','PERSON', tree, pattern = MARRIES):\n    marries.append(nltk.sem.relextract.show_raw_rtuple(rel))",
     "outputs": [],
     "language": "python",
     "trusted": true,
     "collapsed": false
    },
    {
     "metadata": {},
     "cell_type": "code",
     "input": "AT = re.compile(r'.*\\b[Aa]t\\b')\n\nfor rel in nltk.sem.extract_rels('PERSON','LOCATION', tree, pattern = AT):\n    marries.append(nltk.sem.relextract.show_raw_rtuple(rel))",
     "outputs": [],
     "language": "python",
     "trusted": true,
     "collapsed": false
    },
    {
     "metadata": {},
     "cell_type": "code",
     "input": "#Putting it all together:",
     "outputs": [],
     "language": "python",
     "trusted": true,
     "collapsed": false
    },
    {
     "metadata": {},
     "cell_type": "code",
     "input": "#Set variable to collect info\nmarries = []\n\nfor doc in march1984_tagged:\n    #Parse every document \n    tree = chunker_rules(doc)\n    #Relationship Extractors - #1\n    MARRIES = re.compile(r'.*\\b[Mm]arrie[sd]\\b')\n    for rel in nltk.sem.extract_rels('PERSON','PERSON', tree, pattern = MARRIES):\n        marries.append(nltk.sem.relextract.show_raw_rtuple(rel))\n    #Relationship Extractors - #2\n    MARRIESv2 = re.compile(r'.*\\b[Mm]arrie[sd]\\b')\n    for rel in nltk.sem.extract_rels('PERSON','ORGANIZATION', tree, pattern = MARRIESv2):\n        marries.append(nltk.sem.relextract.show_raw_rtuple(rel))\n    #Relationship Extractors - #3\n    WED = re.compile(r'.*\\b[Ww]eds?\\b')\n    for rel in nltk.sem.extract_rels('PERSON','PERSON', tree, pattern = WED):\n        marries.append(nltk.sem.relextract.show_raw_rtuple(rel))\n    #Relationship Extractors - #4\n    ENGAGEMENT = re.compile(r'.*\\b([Ee]ngaged|[Ee]ngagement|[Ee]gagment\\b)')\n    for rel in nltk.sem.extract_rels('PERSON','PERSON', tree, pattern = ENGAGEMENT):\n        marries.append(nltk.sem.relextract.show_raw_rtuple(rel))\n    ",
     "outputs": [],
     "language": "python",
     "trusted": true,
     "collapsed": false
    },
    {
     "metadata": {},
     "cell_type": "code",
     "input": "print len(marries)\n\nfor i in marries:\n    print i + '\\n'",
     "outputs": [],
     "language": "python",
     "trusted": true,
     "collapsed": false
    },
    {
     "metadata": {},
     "cell_type": "code",
     "input": "#Set variable to collect info\nmarriage_location = []\n\nfor doc in march1984_tagged:\n    #Parse every document \n    tree = chunker_rules(doc)\n    #Relationship Extractors - #1\n    AT = re.compile(r'.*\\b[Aa]t\\b')\n    for rel in nltk.sem.extract_rels('PERSON','ORGANIZATION', tree, pattern = AT):\n        marriage_location.append(nltk.sem.relextract.show_raw_rtuple(rel))\n    for rel in nltk.sem.extract_rels('LOCATION','ORGANIZATION', tree, pattern = AT):\n        marriage_location.append(nltk.sem.relextract.show_raw_rtuple(rel))\n    for rel in nltk.sem.extract_rels('PERSON','LOCATION', tree, pattern = AT):\n        marriage_location.append(nltk.sem.relextract.show_raw_rtuple(rel))\n    for rel in nltk.sem.extract_rels('LOCATION','PERSON', tree, pattern = AT):\n        marriage_location.append(nltk.sem.relextract.show_raw_rtuple(rel))",
     "outputs": [],
     "language": "python",
     "trusted": true,
     "collapsed": false
    },
    {
     "metadata": {},
     "cell_type": "code",
     "input": "for i in marriage_location:\n    print i + '\\n'",
     "outputs": [],
     "language": "python",
     "trusted": true,
     "collapsed": false
    },
    {
     "metadata": {},
     "cell_type": "code",
     "input": "marriage_location_dict = {'marriage_loc':[]}\nloc = []\nother = []\nfor i in marriage_location:\n    if 'performed' in i or 'arrie' in i or 'Weds' in i or 'officiate' in i or 'Temple' in i or 'Church' in i :\n#     if 'Church' in i or 'Temple' in i :\n        loc.append(i)\n        marriage_location_dict['marriage_loc'].append(i)\n    else:\n        other.append(i)",
     "outputs": [],
     "language": "python",
     "trusted": true,
     "collapsed": false
    },
    {
     "metadata": {},
     "cell_type": "code",
     "input": "print len(marries)\nprint len (marriage_location)\nprint len(loc)\nprint len(other)",
     "outputs": [],
     "language": "python",
     "trusted": true,
     "collapsed": false
    },
    {
     "metadata": {},
     "cell_type": "code",
     "input": "for i in loc:\n    print i + '\\n'",
     "outputs": [],
     "language": "python",
     "trusted": true,
     "collapsed": false
    },
    {
     "metadata": {},
     "cell_type": "code",
     "input": "len(marriage_location_dict['marriage_loc'])",
     "outputs": [],
     "language": "python",
     "trusted": true,
     "collapsed": false
    },
    {
     "metadata": {},
     "cell_type": "code",
     "input": "print marriage_location_dict",
     "outputs": [],
     "language": "python",
     "trusted": true,
     "collapsed": false
    },
    {
     "metadata": {},
     "cell_type": "code",
     "input": "",
     "outputs": [],
     "language": "python",
     "trusted": true,
     "collapsed": false
    }
   ],
   "metadata": {}
  }
 ],
 "metadata": {
  "name": "",
  "signature": "sha256:3ecf8aab785c892aae24f34a1ca226d4b63a481d39098d3c4a3c45581cdca0f3",
  "gist_id": "f18f7431fc48d542efe7"
 },
 "nbformat": 3
 }