fayeip · October 18, 2014 15:47
diff --git a/Keyphrase Identification Assignment - Faye Ip.ipynb b/Keyphrase Identification Assignment - Faye Ip.ipynb
 {
 "worksheets": [
  {
   "cells": [
    {
     "metadata": {},
     "cell_type": "code",
     "input": "import nltk\nimport string \nfrom nltk.collocations import *\nfrom nltk.corpus import wordnet as wn\nfrom nltk.corpus import brown\nfrom nltk.corpus import stopwords\nfrom math import log\nfrom collections import Counter \nfrom tabulate import tabulate\n",
     "prompt_number": 316,
     "outputs": [],
     "language": "python",
     "trusted": true,
     "collapsed": false
    },
    {
     "metadata": {},
     "cell_type": "code",
     "input": "macbeth_sents = nltk.corpus.gutenberg.sents('shakespeare-macbeth.txt')\ncaesar_sents = nltk.corpus.gutenberg.sents('shakespeare-caesar.txt')\nhamlet_sents = nltk.corpus.gutenberg.sents('shakespeare-hamlet.txt') \n\ndef remove_stage_directions(sents):\n    sents_wo_stage_dir = []\n    for i in range(1,len(sents)):\n        if len(sents[i]) == 2 and sents[i][1] == '.':   #remove character names as part of stage directions\n            continue \n        if sents[i][0] != 'Actus' and sents[i][0] != 'Scoena' and sents[i][0] != 'Scena' and sents[i][0] != 'Scaena' and sents[i][0] != 'Enter' and sents[i][0] != 'Exeunt' and sents[i][0] != 'Exit':\n            sents_wo_stage_dir.append(sents[i])\n    return sents_wo_stage_dir \n\nmacbeth_sents_wo_stage_dir = remove_stage_directions(macbeth_sents)\ncaesar_sents_wo_stage_dir = remove_stage_directions(caesar_sents)\nhamlet_sents_wo_stage_dir = remove_stage_directions(hamlet_sents)\n\n",
     "prompt_number": 317,
     "outputs": [],
     "language": "python",
     "trusted": true,
     "collapsed": false
    },
    {
     "metadata": {},
     "cell_type": "code",
     "input": "brown_words = brown.words(categories='news')\nbrown_sents = brown.sents(categories='news')\nbrown_paras = brown.paras(categories='news')\n",
     "prompt_number": 318,
     "outputs": [],
     "language": "python",
     "trusted": true,
     "collapsed": false
    },
    {
     "metadata": {},
     "cell_type": "markdown",
     "source": "Technique 1: Most Frequent Trigrams, Bigrams, Unigrams"
    },
    {
     "metadata": {},
     "cell_type": "code",
     "input": "\ndef get_cleaned_words(unclean_sents):\n    cleaned_words = []\n    for i in range(len(unclean_sents)):\n        for j in range(len(unclean_sents[i])):\n            if len(unclean_sents[i][j]) == 2 and not unclean_sents[i][j][0].isalnum() and not unclean_sents[i][j][1].isalnum():  \n            #to get rid of weird double punctuations like \"''\"  \n                continue \n            if unclean_sents[i][j] not in string.punctuation and unclean_sents[i][j].lower() not in stopwords.words('english'):\n                cleaned_words.append(unclean_sents[i][j].lower())\n    return cleaned_words\n\ndef get_ngrams_fds(cleaned_words):\n    bg = nltk.bigrams(cleaned_words)\n    tg = nltk.trigrams(cleaned_words)\n    unigram_fd = nltk.FreqDist(cleaned_words)\n    bigram_fd = nltk.FreqDist(bg)\n    trigram_fd = nltk.FreqDist(tg)\n    return trigram_fd, bigram_fd, unigram_fd\n\n",
     "prompt_number": 319,
     "outputs": [],
     "language": "python",
     "trusted": true,
     "collapsed": false
    },
    {
     "metadata": {},
     "cell_type": "code",
     "input": "macbeth_words_cleaned_nopunc = get_cleaned_words(macbeth_sents_wo_stage_dir)\ncaesar_words_cleaned_nopunc = get_cleaned_words(caesar_sents_wo_stage_dir)\nhamlet_words_cleaned_nopunc = get_cleaned_words(hamlet_sents_wo_stage_dir)\n\nmacbeth_trigram_fd, macbeth_bigram_fd, macbeth_unigram_fd = get_ngrams_fds(macbeth_words_cleaned_nopunc)\ncaesar_trigram_fd, caesar_bigram_fd, caesar_unigram_fd = get_ngrams_fds(caesar_words_cleaned_nopunc)\nhamlet_trigram_fd, hamlet_bigram_fd, hamlet_unigram_fd = get_ngrams_fds(hamlet_words_cleaned_nopunc)\n\n#create list for printing with tabulate()\ndef print_ngrams_with_tabulate(tgfd, bgfd, ugfd):\n    list_for_printing = []\n    for i in range(20):\n        list_for_printing.append([\" \".join(tgfd.keys()[i]), \" \".join(bgfd.keys()[i]), ugfd.keys()[i]])\n    print tabulate(list_for_printing, headers=[\"Trigrams\", \"Bigrams\", \"Unigrams\"])\n\nprint \"RESULTS FOR MACBETH\"\nprint_ngrams_with_tabulate(macbeth_trigram_fd, macbeth_bigram_fd, macbeth_unigram_fd)\nprint \" \"\nprint \"RESULTS FOR CAESAR\"\nprint_ngrams_with_tabulate(caesar_trigram_fd, caesar_bigram_fd, caesar_unigram_fd)\nprint \" \"\nprint \"RESULTS FOR HAMLET\"\nprint_ngrams_with_tabulate(hamlet_trigram_fd, hamlet_bigram_fd, hamlet_unigram_fd)\nprint \" \"",
     "prompt_number": 320,
     "outputs": [
      {
       "output_type": "stream",
       "text": "RESULTS FOR MACBETH\nTrigrams               Bigrams       Unigrams\n---------------------  ------------  ----------\nburne cauldron bubble  o th          d\nfire burne cauldron    thane cawdor  haue\nile doe ile            st thou       thou\nthou speak st          thou art      shall\ntrouble fire burne     good lord     vpon\nappar macbeth macbeth  haue done     thee\nbyrnane wood come      let vs        th\ncauldron bubble 2      wee l         vs\nchildren shall kings   knock knock   yet\ncome come come         o re          thy\ndoe ile doe            would st      come\ndouble double toyle    call d        would\ndouble toyle trouble   euery one     hath\ngod blesse vs          make vs       good\ngood lord haue         mine eyes     time\ngood lord time         mine owne     macbeth\nhaile king scotland    murther d     like\nhaile macbeth haile    thy selfe     let\nhaile thee thane       worthy thane  st\nhaue done harme        would haue    say",
       "stream": "stdout"
      },
      {
       "output_type": "stream",
       "text": "\n \nRESULTS FOR CAESAR\nTrigrams                     Bigrams       Unigrams\n---------------------------  ------------  ----------\nwee l heare                  let vs        d\nbeware ides march            wee l         caesar\nbrutus honourable man        mark antony   haue\nbrutus sayes ambitious       marke antony  brutus\ncaesar lou d                 st thou       shall\nhath done deed               thou art      thou\nlet vs heare                 would haue    cassius\nmark antony shall            art thou      come\nmine owne part               caesar shall  good\nthee thou st                 good night    let\nthou sleep st                noble brutus  o\nambitious brutus honourable  thou hast     know\nanswer euery man             d caesar      men\nantony noble antony          good morrow   antony\nbid giue thee                haue done     vs\nbreefely wisely truly        caesar caes   heere\nbrutus thou sleep            mou d         man\ncaesar shall go              shall finde   thy\ncaesar thou art              thou st       thee\ncaesars funerall friend      antony shall  vpon",
       "stream": "stdout"
      },
      {
       "output_type": "stream",
       "text": "\n \nRESULTS FOR HAMLET\nTrigrams                        Bigrams        Unigrams\n------------------------------  -------------  ----------\nwould st thou                   good lord      d\nsit downe let                   o re           lord\nburied christian buriall        wee l          haue\ncharge thee speake              haue seene     shall\nclay made guest                 lord hamlet    thou\ncomicall historicall pastorall  haue heard     come\nd drown d                       lord haue      let\ndeere brothers death            st thou        good\ndost thou heare                 thou hast      thy\ndowne let vs                    fathers death  hamlet\ndrown d drown                   dost thou      oh\ndye sleepe sleepe               good friends   like\nere go bed                      ile haue       would\nfather much offended            let see        know\ngod blesse sir                  let vs         well\ngod buy ye                      set downe      tis\ngoe ile follow                  thou art       king\ngood friends oh                 well lord      selfe\nhamlet good madam               would haue     o\nhamlet thou hast                drown d        loue",
       "stream": "stdout"
      },
      {
       "output_type": "stream",
       "text": "\n \n",
       "stream": "stdout"
      }
     ],
     "language": "python",
     "trusted": true,
     "collapsed": false
    },
    {
     "metadata": {},
     "cell_type": "code",
     "input": "brown_words_nopunc = get_cleaned_words(brown_sents)\n\nbrown_trigram_fd, brown_bigram_fd, brown_unigram_fd = get_ngrams_fds(brown_words_nopunc)\n\nprint \"RESULTS FOR BROWN\"\nprint_ngrams_with_tabulate(brown_trigram_fd, brown_bigram_fd, brown_unigram_fd)\nprint \" \" ",
     "prompt_number": 321,
     "outputs": [
      {
       "output_type": "stream",
       "text": "RESULTS FOR BROWN\nTrigrams                   Bigrams             Unigrams\n-------------------------  ------------------  ----------\nmr. hawksley said          new york            said\nnew york city              per cent            mrs.\n10 per cent                mr. mrs.            would\nfour home runs             united states       new\nhome rule charter          last week           one\nnew york yankees           last year           last\n4 per cent                 white house         two\naged care plan             high school         mr.\namerican catholic higher   home runs           first\ncatholic higher education  u. s.               state\nla dolce vita              president kennedy   president\nnational football league   last night          year\nper cent interest          said would          home\npotato chip industry       san francisco       also\ntwo years ago              years ago           made\n12 months ended            anti-trust laws     time\n60 home runs               mr. kennedy         years\nannapolis jan. 7           kansas city         three\nanne arundel county        premier khrushchev  house\nannounce birth daughter    los angeles         week",
       "stream": "stdout"
      },
      {
       "output_type": "stream",
       "text": "\n \n",
       "stream": "stdout"
      }
     ],
     "language": "python",
     "trusted": true,
     "collapsed": false
    },
    {
     "metadata": {},
     "cell_type": "markdown",
     "source": "    \n   \nTechnique 2: PMI and Chi-squared Collocations\n"
    },
    {
     "metadata": {},
     "cell_type": "code",
     "input": "def find_colloc_pmi_chisq(words_cleaned_nopunc):\n    bigram_measures = nltk.collocations.BigramAssocMeasures()\n    trigram_measures = nltk.collocations.TrigramAssocMeasures()\n    finder1 = BigramCollocationFinder.from_words(words_cleaned_nopunc)\n    finder2 = TrigramCollocationFinder.from_words(words_cleaned_nopunc)\n    finder1.apply_freq_filter(2)\n    finder2.apply_freq_filter(2)\n    bg_pmi = finder1.nbest(bigram_measures.pmi, 20)\n    bg_chisq = finder1.nbest(bigram_measures.chi_sq, 20)\n    tg_pmi = finder2.nbest(trigram_measures.pmi, 20)\n    tg_chisq = finder2.nbest(trigram_measures.chi_sq, 20)\n    return bg_pmi, bg_chisq, tg_pmi, tg_chisq\n\nmacbeth_bg_pmi, macbeth_bg_chisq, macbeth_tg_pmi, macbeth_tg_chisq = find_colloc_pmi_chisq(macbeth_words_cleaned_nopunc)\ncaesar_bg_pmi, caesar_bg_chisq, caesar_tg_pmi, caesar_tg_chisq = find_colloc_pmi_chisq(caesar_words_cleaned_nopunc)\nhamlet_bg_pmi, hamlet_bg_chisq, hamlet_tg_pmi, hamlet_tg_chisq = find_colloc_pmi_chisq(hamlet_words_cleaned_nopunc)\n\n",
     "prompt_number": 322,
     "outputs": [],
     "language": "python",
     "trusted": true,
     "collapsed": false
    },
    {
     "metadata": {},
     "cell_type": "code",
     "input": "def print_ngram_pmi_chisq_with_tabulate(tg_pmi, tg_chisq, bg_pmi, bg_chisq):\n    list_for_printing = []\n    for i in range(20):\n        list_for_printing.append([\" \".join(tg_pmi[i]), \" \".join(tg_chisq[i]), \" \".join(bg_pmi[i]), \" \".join(bg_chisq[i])])\n    print tabulate(list_for_printing, headers=[\"Trigram-PMI\", \"Trigram-ChiSq\", \"Bigram-PMI\", \"Bigram-ChiSq\"])",
     "prompt_number": 323,
     "outputs": [],
     "language": "python",
     "trusted": true,
     "collapsed": false
    },
    {
     "metadata": {},
     "cell_type": "code",
     "input": "print \"RESULTS FOR MACBETH\"\nprint_ngram_pmi_chisq_with_tabulate(macbeth_tg_pmi, macbeth_tg_chisq, macbeth_bg_pmi, macbeth_bg_chisq)\nprint \" \"\nprint \"RESULTS FOR CAESAR\"\nprint_ngram_pmi_chisq_with_tabulate(caesar_tg_pmi, caesar_tg_chisq, caesar_bg_pmi, caesar_bg_chisq)\nprint \" \"\nprint \"RESULTS FOR HAMLET\"\nprint_ngram_pmi_chisq_with_tabulate(hamlet_tg_pmi, hamlet_tg_chisq, hamlet_bg_pmi, hamlet_bg_chisq)\nprint \" \"",
     "prompt_number": 324,
     "outputs": [
      {
       "output_type": "stream",
       "text": "RESULTS FOR MACBETH\nTrigram-PMI            Trigram-ChiSq          Bigram-PMI         Bigram-ChiSq\n---------------------  ---------------------  -----------------  -----------------\nburne cauldron bubble  children shall kings   musicke song       musicke song\nfire burne cauldron    wilt thou father       prince cumberland  prince cumberland\ntoyle trouble fire     lesse deseru d         thunder lightning  ten thousand\ntrouble fire burne     seyward ten thousand   ten thousand       dy de\nseyward ten thousand   cauldron bubble 2      dy de              y sey\ndouble toyle trouble   toyle trouble fire     y sey              thunder lightning\ncauldron bubble 2      ring alarum bell       drum colours       burne cauldron\nring alarum bell       till byrnane wood      pallace gate       cauldron bubble\ndouble double toyle    byrnane wood come      burne cauldron     fire burne\ntill byrnane wood      double toyle trouble   cauldron bubble    weyward sisters\nknock knock knock      double double toyle    fire burne         wee l\nbyrnane wood come      old man tis            toyle trouble      drum colours\ngod blesse vs          macbeth haile thee     weyward sisters    pallace gate\nwilt thou father       looke like th          remoue meanes      thane cawdor\nlesse deseru d         ile doe ile            england ireland    toyle trouble\nwood come dunsinane    trouble fire burne     laugh scorne       remoue meanes\nhaile king scotland    fire burne cauldron    ne re              trouble fire\nthane cawdor liues     burne cauldron bubble  lesse deseru       england ireland\nchildren shall kings   god blesse vs          sorry sight        ne re\nappar macbeth macbeth  appar macbeth macbeth  weyard sisters     laugh scorne\n \nRESULTS FOR CAESAR\nTrigram-PMI               Trigram-ChiSq             Bigram-PMI         Bigram-ChiSq\n------------------------  ------------------------  -----------------  -----------------\ngoing dwell married       ill temper d              ciuill strife      ciuill strife\ngraunt woman withall      bid giue thee             harme intended     wee l\nbreefely wisely truly     man directly breefely     itching palme      harme intended\nteare bad verses          caesars funerall friend   popillius lena     itching palme\nwoman withall woman       cicero one cicero         dwell married      popillius lena\nwhether going dwell       euer euer farewell        low alarums        ne re\ndirectly breefely wisely  ple plucke downe          falling sicknesse  ides march\nbeware ides march         therein yee gods          varrus claudio     dwell married\nsonne marcus cato         ride ride messala         ne re              low alarums\nmarcus cato hoe           name whether going        pompeyes porch     falling sicknesse\ntherein yee gods          marcus cato hoe           therein yee        varrus claudio\ndwell married man         sonne marcus cato         went schoole       beware ides\nride ride messala         directly breefely wisely  graunt woman       pompeyes porch\nname whether going        whether going dwell       breefely wisely    went schoole\nple plucke downe          breefely wisely truly     liberty freedome   therein yee\nspeake strike redresse    woman withall woman       beware ides        market place\nsleep st awake            graunt woman withall      thunder lightning  metellus cymber\nwee l burne               going dwell married       withall woman      caius ligarius\nparting well made         teare bad verses          woman withall      graunt woman\nman directly breefely     dwell married man         cinna poet         breefely wisely\n \nRESULTS FOR HAMLET\nTrigram-PMI                     Trigram-ChiSq                   Bigram-PMI             Bigram-ChiSq\n------------------------------  ------------------------------  ---------------------  ---------------------\nmason shipwright carpenter      neuer speake haue               barbary horses         barbary horses\ncomicall historicall pastorall  inobled queene good             mason shipwright       mason shipwright\nclay made guest                 father much offended            shipwright carpenter   shipwright carpenter\nburied christian buriall        o pit clay                      comicall historicall   e ene\nmade guest meete                ere go bed                      builds stronger        comicall historicall\npit clay made                   vse art mad                     guest meete            behinde arras\no pit clay                      pit clay made                   clowne sings           ee n\ndye sleepe sleepe               sweare sword sweare             historicall pastorall  closes consequence\nsweare sword sweare             comicall historicall pastorall  pit clay               builds stronger\ngod buy ye                      clay made guest                 behinde arras          guest meete\nvse art mad                     made guest meete                ee n                   clowne sings\ngod blesse sir                  mason shipwright carpenter      cocke crew             historicall pastorall\nfather much offended            buried christian buriall        fro m                  pit clay\ndeere brothers death            god buy ye                      heau n                 cocke crew\ninobled queene good             dye sleepe sleepe               nine yeare             fro m\ndrown d drown                   deere brothers death            closes consequence     heau n\nere go bed                      king haue letters               e ene                  nine yeare\nwee l put                       goe ile follow                  sixe french            wee l\nsit downe let                   god blesse sir                  white snow             christian buriall\ngoe ile follow                  charge thee speake              powres poyson          sixe french\n \n",
       "stream": "stdout"
      }
     ],
     "language": "python",
     "trusted": true,
     "collapsed": false
    },
    {
     "metadata": {},
     "cell_type": "code",
     "input": "brown_bg_pmi, brown_bg_chisq, brown_tg_pmi, brown_tg_chisq = find_colloc_pmi_chisq(brown_words_nopunc)\n\nprint \"RESULTS FOR BROWN\"\nprint_ngram_pmi_chisq_with_tabulate(brown_tg_pmi, brown_tg_chisq, brown_bg_pmi, brown_bg_chisq)\nprint \" \"",
     "prompt_number": 325,
     "outputs": [
      {
       "output_type": "stream",
       "text": "RESULTS FOR BROWN\nTrigram-PMI                          Trigram-ChiSq                      Bigram-PMI            Bigram-ChiSq\n-----------------------------------  ---------------------------------  --------------------  -------------------\nku klux klan                         prince souvanna phouma             bateau neckline       viet nam\npinar del rio                        jury room said                     cape cod              hong kong\nhyannis port mass.                   three sisters mrs.                 corpus christi        dolce vita\n58th precinct 23d                    washington moscow last             del rio               notre dame\nfort knox ky.                        despite mr. rusk's                 dimes quarters        scottish rite\njudges 58th precinct                 w. buchheister president           ethnic groupings      duncan phyfe\nracial discrimination employment     first atomic submarine             hays kan.             sterling township\nmakes cleaning drying                seeing hit home                    hazards pedestrians   bateau neckline\nelectronic data processing           entertained luncheon home          klux klan             cape cod\ntraffic hazards pedestrians          members book club                  ku klux               corpus christi\nnotre dame chapter                   national christian family          pinar del             del rio\nprince souvanna phouma               american friends service           puerto rico           dimes quarters\nstephanie shaw hillsboro             big four summit                    scenic effects        ethnic groupings\nla dolce vita                        foreign relations committee        tanks artillery       hays kan.\nprecinct 23d ward                    yards per game                     wasteful duplication  hazards pedestrians\nuniversity's charter by-laws         relations national democratic      bake slowly           klux klan\nendowments institutions established  since august 1                     chisholm trail        ku klux\nguilty reckless driving              national council jewish            cleaning drying       pinar del\nduncan phyfe furniture               armed services committee           clinton bowman        puerto rico\nconvicted engaging conspiracy        association university professors  coral gables          scenic effects\n \n",
       "stream": "stdout"
      }
     ],
     "language": "python",
     "trusted": true,
     "collapsed": false
    },
    {
     "metadata": {},
     "cell_type": "markdown",
     "source": "Technique 3: TF-IDF"
    },
    {
     "metadata": {},
     "cell_type": "code",
     "input": "# tf-idf of each word with respect to a document: (1 + log(tf)) * log(N/df)\n\n\ndef compute_idf_scores(collec_list, collec_set):\n    doc_freq_counter = Counter()\n    num_of_docs = len(collec_list)\n    for word in collec_set:\n        for i in range(len(collec_list)):\n            if word in collec_list[i]:\n                doc_freq_counter[word] += 1 \n    idf_scores_dict = {} \n    for word in doc_freq_counter:\n        idf_scores_dict[word] = log ( float(num_of_docs) / float(doc_freq_counter[word]) )                \n    return idf_scores_dict \n\ndef compute_tf_idf(document, idf_scores_dict):\n    words_fd = nltk.FreqDist(document)\n    tf_idf = Counter()    #storing in a Counter() rather than dict so that i can add Counters and use most_common() later \n    for word in words_fd:\n        tf_idf[word] = (1.0 + log( float(words_fd[word]) )) * idf_scores_dict[word]\n    return tf_idf\n\ndef print_tf_idf_with_tabulate(tfidf_list):\n    list_for_printing = []\n    for word, score in tfidf_list:\n        list_for_printing.append([word, score])\n    print tabulate(list_for_printing, headers=[\"Word\", \"TF-IDF Score\"])\n\n",
     "prompt_number": 326,
     "outputs": [],
     "language": "python",
     "trusted": true,
     "collapsed": false
    },
    {
     "metadata": {},
     "cell_type": "code",
     "input": "shakespeare_collection_list = [macbeth_words_cleaned_nopunc] + [caesar_words_cleaned_nopunc] + [hamlet_words_cleaned_nopunc]\nshakespeare_collection_set = set(macbeth_words_cleaned_nopunc + caesar_words_cleaned_nopunc + hamlet_words_cleaned_nopunc) \nshakespeare_idf_scores = compute_idf_scores(shakespeare_collection_list, shakespeare_collection_set)\n\nmacbeth_tf_idf = compute_tf_idf(macbeth_words_cleaned_nopunc, shakespeare_idf_scores)\ncaesar_tf_idf = compute_tf_idf(caesar_words_cleaned_nopunc, shakespeare_idf_scores)\nhamlet_tf_idf = compute_tf_idf(hamlet_words_cleaned_nopunc, shakespeare_idf_scores)\n\nprint \"RESULTS FOR MACBETH\"\nprint_tf_idf_with_tabulate(macbeth_tf_idf.most_common()[0:20])\nprint \" \"\nprint \"RESULTS FOR CAESAR\"\nprint_tf_idf_with_tabulate(caesar_tf_idf.most_common()[0:20])\nprint \" \"\nprint \"RESULTS FOR HAMLET\"\nprint_tf_idf_with_tabulate(hamlet_tf_idf.most_common()[0:20])\nprint \" \"\n",
     "prompt_number": 327,
     "outputs": [
      {
       "output_type": "stream",
       "text": "RESULTS FOR MACBETH\nWord         TF-IDF Score\n---------  --------------\nmacbeth           5.28066\nbanquo            4.75941\nthane             4.63491\ncawdor            4.44336\nscotland          3.82856\nduncan            3.62826\nmacduffe          3.62826\nfleans            3.38311\ndunsinane         3.38311\nmurth             3.23641\nmacduff           3.06706\npowre             3.06706\nmalcolme          3.06706\nenglish           3.06706\nthanes            2.86676\ncauldron          2.86676\nseyward           2.86676\ntitle             2.86676\ntoth              2.86676\nhorror            2.86676\n \nRESULTS FOR CAESAR\nWord         TF-IDF Score\n---------  --------------\ncassius           5.85641\nantony            5.68465\ncaesars           5.06561\ncaes              4.79796\ncaska             4.678\ntitinius          4.38976\nmessala           4.33341\noctauius          4.33341\nromans            4.27401\nlucius            4.27401\ncapitoll          4.21122\ncaius             3.9165\nphilippi          3.73297\ncinna             3.73297\npindarus          3.73297\nlucillius         3.62826\nmetellus          3.62826\ncymber            3.51251\ndecius            3.51251\nportia            3.51251\n \nRESULTS FOR HAMLET\nWord             TF-IDF Score\n-------------  --------------\nhamlet                5.9532\nhoratio               4.87123\nlaertes               4.79796\nophelia               4.44336\ndenmarke              4.33341\ngertrude              3.82856\nverie                 3.73297\nexcellent             3.73297\nfortinbras            3.73297\npyrrhus               3.62826\nguildensterne         3.62826\nphrase                3.51251\nlaw                   3.23641\nhamlets               3.23641\ne                     3.23641\nscull                 3.06706\ndanish                3.06706\nrosincrance           3.06706\ndrown                 3.06706\nmaid                  3.06706",
       "stream": "stdout"
      },
      {
       "output_type": "stream",
       "text": "\n \n",
       "stream": "stdout"
      }
     ],
     "language": "python",
     "trusted": true,
     "collapsed": false
    },
    {
     "metadata": {},
     "cell_type": "code",
     "input": "def create_collection_list(words_nopunc):   \n    #this is my little hack workaround to the fact that the Brown corpus isn't neatly divided into articles \n    #i'm dividing up all words into \"documents\" of 5,000 words each \n    collection_list = []\n    for i in range(0,len(words_nopunc),5000):\n        if i+5000 <= len(words_nopunc):\n            collection_list.append(words_nopunc[i:i+5000])\n    collection_list.append(words_nopunc[((len(words_nopunc)//5000)*5000):len(words_nopunc)])\n    return collection_list \n\n\nbrown_collection_list = create_collection_list(brown_words_nopunc) \nbrown_collection_set = set(brown_words_nopunc)    \nbrown_idf_scores = compute_idf_scores(brown_collection_list, brown_collection_set)\n\nbrown_tf_idf_list = []\nfor i in range(len(brown_collection_list)):\n    doc_tfidf = compute_tf_idf(brown_collection_list[i], brown_idf_scores)\n    brown_tf_idf_list.append(doc_tfidf)\n",
     "prompt_number": 328,
     "outputs": [],
     "language": "python",
     "trusted": true,
     "collapsed": false
    },
    {
     "metadata": {},
     "cell_type": "code",
     "input": "brown_tf_idf_total = Counter()\nfor counter in brown_tf_idf_list:   #sum up all tf-idf scores for each word across collection\n    brown_tf_idf_total = brown_tf_idf_total + counter\n\nprint \"RESULTS FOR BROWN\"\nprint_tf_idf_with_tabulate(brown_tf_idf_total.most_common()[0:20])\nprint \" \"",
     "prompt_number": 329,
     "outputs": [
      {
       "output_type": "stream",
       "text": "RESULTS FOR BROWN\nWord         TF-IDF Score\n---------  --------------\nmantle            13.7066\nlibrary           13.1159\nruns              12.4353\ndallas            12.3768\ngame              12.2506\nsenate            12.0787\nbaseball          12.0233\nlaos              12.0209\njury              11.9262\ngames             11.9193\npalmer            11.8697\ntax               11.8277\nyesterday         11.3573\nseason            11.2245\navenue            11.219\ncongo             10.9629\nteam              10.9073\nyankees           10.8819\ngeorgia           10.7556\nmaris             10.6027",
       "stream": "stdout"
      },
      {
       "output_type": "stream",
       "text": "\n \n",
       "stream": "stdout"
      }
     ],
     "language": "python",
     "trusted": true,
     "collapsed": false
    },
    {
     "metadata": {},
     "cell_type": "markdown",
     "source": "Apply All Techniques on Mystery Text"
    },
    {
     "metadata": {},
     "cell_type": "code",
     "input": "f = open('mystery.txt','r')\nmystery = f.read()\nmystery_articles = mystery.split('\\n  \\n\\n')\n\nmystery_words = nltk.word_tokenize(mystery)\nmystery_articles_tokenized = []\nfor article in mystery_articles:\n    tokenized = nltk.word_tokenize(article)\n    mystery_articles_tokenized.append(tokenized)\n\ndef modified_get_cleaned_words(unclean_words):   #modifying get_cleaned_words() function from above. \n                                                 #That one was written to take sents as input. Here I have words as input.\n    cleaned_words = []\n    for i in range(len(unclean_words)):\n        if len(unclean_words[i]) == 2 and not unclean_words[i][0].isalnum() and not unclean_words[i][1].isalnum():  #to get rid of weird double punctuations like \"''\"  \n            continue \n        if unclean_words[i] not in string.punctuation and unclean_words[i].lower().strip(string.punctuation) not in stopwords.words('english'):\n            cleaned_words.append(unclean_words[i].lower().strip(string.punctuation))\n    return cleaned_words\n\nmystery_words_cleaned_nopunc = modified_get_cleaned_words(mystery_words)   #clean mystery_words\nmystery_articles_cleaned_nopunc = []   #clean mystery_articles_tokenized \nfor article in mystery_articles_tokenized:\n    mystery_articles_cleaned_nopunc.append(modified_get_cleaned_words(article))\n\n",
     "prompt_number": 330,
     "outputs": [],
     "language": "python",
     "trusted": true,
     "collapsed": false
    },
    {
     "metadata": {},
     "cell_type": "code",
     "input": "# Technique 1: Frequent N-Grams \nmystery_trigram_fd, mystery_bigram_fd, mystery_unigram_fd = get_ngrams_fds(mystery_words_cleaned_nopunc)\nprint \"RESULTS FOR MYSTERY\"\nprint_ngrams_with_tabulate(mystery_trigram_fd, mystery_bigram_fd, mystery_unigram_fd)\nprint \" \"",
     "prompt_number": 331,
     "outputs": [
      {
       "output_type": "stream",
       "text": "RESULTS FOR MYSTERY\nTrigrams                     Bigrams                 Unigrams\n---------------------------  ----------------------  ----------\nmln tonnes vs                mln tonnes              said\nu.s agriculture department   last month              mln\nmln tonnes last              mln dlrs                pct\ntonnes last month            billion dlrs            tonnes\ntrade sources said           sources said            u.s\nlast month exports           mln barrels             dlrs\nweek ended march             new york                dollar\nagriculture department said  last year               last\ntonnes free market           department said         trade\necus per tonne               bank japan              would\nmln last month               traders said            oil\nlast month usda              tonnes vs               wheat\nmonth exports 1985/86        u.s agriculture         japan\npct rise january             pct sulphur             year\nfree market barley           dealers said            yen\nlast month stocks            week ended              new\npct year ago                 agriculture department  prices\nbank japan intervenes        crude oil               market\ndlrs 75 cts                  heating oil             coffee\ndlrs fob gulf                official said           bank",
       "stream": "stdout"
      },
      {
       "output_type": "stream",
       "text": "\n \n",
       "stream": "stdout"
      }
     ],
     "language": "python",
     "trusted": true,
     "collapsed": false
    },
    {
     "metadata": {},
     "cell_type": "code",
     "input": "# Technique 2: PMI and Chi-squared Collocations\nmystery_bg_pmi, mystery_bg_chisq, mystery_tg_pmi, mystery_tg_chisq = find_colloc_pmi_chisq(mystery_words_cleaned_nopunc)\nprint \"RESULTS FOR MYSTERY\"\nprint_ngram_pmi_chisq_with_tabulate(mystery_tg_pmi, mystery_tg_chisq, mystery_bg_pmi, mystery_bg_chisq)\nprint \" \"",
     "prompt_number": 332,
     "outputs": [
      {
       "output_type": "stream",
       "text": "RESULTS FOR MYSTERY\nTrigram-PMI                      Trigram-ChiSq                Bigram-PMI         Bigram-ChiSq\n-------------------------------  ---------------------------  -----------------  -----------------------\n274 264 groundnutseed            undermine confidence future  04/09/87 03/09/87  buenos aires\nbarney harris upham              2,000/2,000 apr egypt        10.8 13.8          van horick\nentre rios misiones              could undermine confidence   103 102            santa fe\nexpellers 103 102                fraternite matin said        20-30 5,000-7,000  cape spencer\nfabricated mica consultancy      move said production         21.20 dlrs/barrel  shearson lehman\nfed's 5-1/2 8-1/2                27 said stocks               21.9 25.8          dean witter\ngems castor sandalwood           expected average said        22.2 23.4          merrill lynch\njuices purees pulp               said production conditional  23.8 29.3          excluded countertrading\nlaying golden egg                crop said generally          264 groundnutseed  hrs edt\npath signalling imminent         cts said raising             274 264            nihon keizai\npurees pulp tomato               said movement since          323 313            1,500/1,700 13-20/4\nwen wei po                       said sunflower maize         36.0 cents/gallon  13-20/4 toledo/seaforth\n36.0 cents/gallon chemical       said view g-7                43.9 48.8          2,000/2,000 apr\n54.5 dual purpose                minister said movement       49.5 0.125         25-4/5-5 naantali/saudi\ncake expellers 103               inc reported said            5-1/2 8-1/2        4,000/3,000 20-30/4\ndepartment's counselor belgrade  said farm issue              52-57 d.i          bahia blanca\nfertile land insect              ltd zurich said              54.5 dual          burro creek\nhighly visible drama             agent country said           83 mm              daps 24-27/4\nkept players sidelined           country said agent           93.90 94.49        days/8,000 13-15/4\nonwards 330 320                  said import barriers         asbestos fibre     enquiries antwerp/libya\n \n",
       "stream": "stdout"
      }
     ],
     "language": "python",
     "trusted": true,
     "collapsed": false
    },
    {
     "metadata": {},
     "cell_type": "code",
     "input": "# Technique 3: TF-IDF\n\nmystery_idf_scores = compute_idf_scores(mystery_articles_cleaned_nopunc, set(mystery_words_cleaned_nopunc))\n\nmystery_tf_idf_list = []\nfor i in range(len(mystery_articles_cleaned_nopunc)):\n    doc_tfidf = compute_tf_idf(mystery_articles_cleaned_nopunc[i], mystery_idf_scores)\n    mystery_tf_idf_list.append(doc_tfidf)\n    \nmystery_tf_idf_total = Counter()\nfor counter in mystery_tf_idf_list:   \n    mystery_tf_idf_total = mystery_tf_idf_total + counter\n\nprint \"RESULTS FOR MYSTERY\"\nprint_tf_idf_with_tabulate(mystery_tf_idf_total.most_common()[0:20])\nprint \" \"\n",
     "prompt_number": 333,
     "outputs": [
      {
       "output_type": "stream",
       "text": "RESULTS FOR MYSTERY\nWord       TF-IDF Score\n-------  --------------\nmln             330.91\npct             314.53\ntonnes          314.158\ndollar          272.326\ndlrs            263.61\nu.s             252.779\nyen             249.776\noil             245.817\njapan           241.153\nwheat           239.961\ntrade           239.15\nwould           238.462\nlast            222.119\nprices          221.353\nmarket          220.56\nbank            217.115\ncoffee          216.79\nyear            216.131\nbillion         208.947\nnew             208.251\n \n",
       "stream": "stdout"
      }
     ],
     "language": "python",
     "trusted": true,
     "collapsed": false
    }
   ],
   "metadata": {}
  }
 ],
 "metadata": {
  "name": "",
  "signature": "sha256:5ff3f7c28f62e728da97505ae9e526c0d0c09ea51a158fe033094a901e9e1a8d"
 },
 "nbformat": 3
 }