Created
October 18, 2014 15:47
-
-
Save fayeip/196400f9217fa41c3625 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"worksheets": [ | |
{ | |
"cells": [ | |
{ | |
"metadata": {}, | |
"cell_type": "code", | |
"input": "import nltk\nimport string \nfrom nltk.collocations import *\nfrom nltk.corpus import wordnet as wn\nfrom nltk.corpus import brown\nfrom nltk.corpus import stopwords\nfrom math import log\nfrom collections import Counter \nfrom tabulate import tabulate\n", | |
"prompt_number": 316, | |
"outputs": [], | |
"language": "python", | |
"trusted": true, | |
"collapsed": false | |
}, | |
{ | |
"metadata": {}, | |
"cell_type": "code", | |
"input": "macbeth_sents = nltk.corpus.gutenberg.sents('shakespeare-macbeth.txt')\ncaesar_sents = nltk.corpus.gutenberg.sents('shakespeare-caesar.txt')\nhamlet_sents = nltk.corpus.gutenberg.sents('shakespeare-hamlet.txt') \n\ndef remove_stage_directions(sents):\n sents_wo_stage_dir = []\n for i in range(1,len(sents)):\n if len(sents[i]) == 2 and sents[i][1] == '.': #remove character names as part of stage directions\n continue \n if sents[i][0] != 'Actus' and sents[i][0] != 'Scoena' and sents[i][0] != 'Scena' and sents[i][0] != 'Scaena' and sents[i][0] != 'Enter' and sents[i][0] != 'Exeunt' and sents[i][0] != 'Exit':\n sents_wo_stage_dir.append(sents[i])\n return sents_wo_stage_dir \n\nmacbeth_sents_wo_stage_dir = remove_stage_directions(macbeth_sents)\ncaesar_sents_wo_stage_dir = remove_stage_directions(caesar_sents)\nhamlet_sents_wo_stage_dir = remove_stage_directions(hamlet_sents)\n\n", | |
"prompt_number": 317, | |
"outputs": [], | |
"language": "python", | |
"trusted": true, | |
"collapsed": false | |
}, | |
{ | |
"metadata": {}, | |
"cell_type": "code", | |
"input": "brown_words = brown.words(categories='news')\nbrown_sents = brown.sents(categories='news')\nbrown_paras = brown.paras(categories='news')\n", | |
"prompt_number": 318, | |
"outputs": [], | |
"language": "python", | |
"trusted": true, | |
"collapsed": false | |
}, | |
{ | |
"metadata": {}, | |
"cell_type": "markdown", | |
"source": "Technique 1: Most Frequent Trigrams, Bigrams, Unigrams" | |
}, | |
{ | |
"metadata": {}, | |
"cell_type": "code", | |
"input": "\ndef get_cleaned_words(unclean_sents):\n cleaned_words = []\n for i in range(len(unclean_sents)):\n for j in range(len(unclean_sents[i])):\n if len(unclean_sents[i][j]) == 2 and not unclean_sents[i][j][0].isalnum() and not unclean_sents[i][j][1].isalnum(): \n #to get rid of weird double punctuations like \"''\" \n continue \n if unclean_sents[i][j] not in string.punctuation and unclean_sents[i][j].lower() not in stopwords.words('english'):\n cleaned_words.append(unclean_sents[i][j].lower())\n return cleaned_words\n\ndef get_ngrams_fds(cleaned_words):\n bg = nltk.bigrams(cleaned_words)\n tg = nltk.trigrams(cleaned_words)\n unigram_fd = nltk.FreqDist(cleaned_words)\n bigram_fd = nltk.FreqDist(bg)\n trigram_fd = nltk.FreqDist(tg)\n return trigram_fd, bigram_fd, unigram_fd\n\n", | |
"prompt_number": 319, | |
"outputs": [], | |
"language": "python", | |
"trusted": true, | |
"collapsed": false | |
}, | |
{ | |
"metadata": {}, | |
"cell_type": "code", | |
"input": "macbeth_words_cleaned_nopunc = get_cleaned_words(macbeth_sents_wo_stage_dir)\ncaesar_words_cleaned_nopunc = get_cleaned_words(caesar_sents_wo_stage_dir)\nhamlet_words_cleaned_nopunc = get_cleaned_words(hamlet_sents_wo_stage_dir)\n\nmacbeth_trigram_fd, macbeth_bigram_fd, macbeth_unigram_fd = get_ngrams_fds(macbeth_words_cleaned_nopunc)\ncaesar_trigram_fd, caesar_bigram_fd, caesar_unigram_fd = get_ngrams_fds(caesar_words_cleaned_nopunc)\nhamlet_trigram_fd, hamlet_bigram_fd, hamlet_unigram_fd = get_ngrams_fds(hamlet_words_cleaned_nopunc)\n\n#create list for printing with tabulate()\ndef print_ngrams_with_tabulate(tgfd, bgfd, ugfd):\n list_for_printing = []\n for i in range(20):\n list_for_printing.append([\" \".join(tgfd.keys()[i]), \" \".join(bgfd.keys()[i]), ugfd.keys()[i]])\n print tabulate(list_for_printing, headers=[\"Trigrams\", \"Bigrams\", \"Unigrams\"])\n\nprint \"RESULTS FOR MACBETH\"\nprint_ngrams_with_tabulate(macbeth_trigram_fd, macbeth_bigram_fd, macbeth_unigram_fd)\nprint \" \"\nprint \"RESULTS FOR CAESAR\"\nprint_ngrams_with_tabulate(caesar_trigram_fd, caesar_bigram_fd, caesar_unigram_fd)\nprint \" \"\nprint \"RESULTS FOR HAMLET\"\nprint_ngrams_with_tabulate(hamlet_trigram_fd, hamlet_bigram_fd, hamlet_unigram_fd)\nprint \" \"", | |
"prompt_number": 320, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"text": "RESULTS FOR MACBETH\nTrigrams Bigrams Unigrams\n--------------------- ------------ ----------\nburne cauldron bubble o th d\nfire burne cauldron thane cawdor haue\nile doe ile st thou thou\nthou speak st thou art shall\ntrouble fire burne good lord vpon\nappar macbeth macbeth haue done thee\nbyrnane wood come let vs th\ncauldron bubble 2 wee l vs\nchildren shall kings knock knock yet\ncome come come o re thy\ndoe ile doe would st come\ndouble double toyle call d would\ndouble toyle trouble euery one hath\ngod blesse vs make vs good\ngood lord haue mine eyes time\ngood lord time mine owne macbeth\nhaile king scotland murther d like\nhaile macbeth haile thy selfe let\nhaile thee thane worthy thane st\nhaue done harme would haue say", | |
"stream": "stdout" | |
}, | |
{ | |
"output_type": "stream", | |
"text": "\n \nRESULTS FOR CAESAR\nTrigrams Bigrams Unigrams\n--------------------------- ------------ ----------\nwee l heare let vs d\nbeware ides march wee l caesar\nbrutus honourable man mark antony haue\nbrutus sayes ambitious marke antony brutus\ncaesar lou d st thou shall\nhath done deed thou art thou\nlet vs heare would haue cassius\nmark antony shall art thou come\nmine owne part caesar shall good\nthee thou st good night let\nthou sleep st noble brutus o\nambitious brutus honourable thou hast know\nanswer euery man d caesar men\nantony noble antony good morrow antony\nbid giue thee haue done vs\nbreefely wisely truly caesar caes heere\nbrutus thou sleep mou d man\ncaesar shall go shall finde thy\ncaesar thou art thou st thee\ncaesars funerall friend antony shall vpon", | |
"stream": "stdout" | |
}, | |
{ | |
"output_type": "stream", | |
"text": "\n \nRESULTS FOR HAMLET\nTrigrams Bigrams Unigrams\n------------------------------ ------------- ----------\nwould st thou good lord d\nsit downe let o re lord\nburied christian buriall wee l haue\ncharge thee speake haue seene shall\nclay made guest lord hamlet thou\ncomicall historicall pastorall haue heard come\nd drown d lord haue let\ndeere brothers death st thou good\ndost thou heare thou hast thy\ndowne let vs fathers death hamlet\ndrown d drown dost thou oh\ndye sleepe sleepe good friends like\nere go bed ile haue would\nfather much offended let see know\ngod blesse sir let vs well\ngod buy ye set downe tis\ngoe ile follow thou art king\ngood friends oh well lord selfe\nhamlet good madam would haue o\nhamlet thou hast drown d loue", | |
"stream": "stdout" | |
}, | |
{ | |
"output_type": "stream", | |
"text": "\n \n", | |
"stream": "stdout" | |
} | |
], | |
"language": "python", | |
"trusted": true, | |
"collapsed": false | |
}, | |
{ | |
"metadata": {}, | |
"cell_type": "code", | |
"input": "brown_words_nopunc = get_cleaned_words(brown_sents)\n\nbrown_trigram_fd, brown_bigram_fd, brown_unigram_fd = get_ngrams_fds(brown_words_nopunc)\n\nprint \"RESULTS FOR BROWN\"\nprint_ngrams_with_tabulate(brown_trigram_fd, brown_bigram_fd, brown_unigram_fd)\nprint \" \" ", | |
"prompt_number": 321, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"text": "RESULTS FOR BROWN\nTrigrams Bigrams Unigrams\n------------------------- ------------------ ----------\nmr. hawksley said new york said\nnew york city per cent mrs.\n10 per cent mr. mrs. would\nfour home runs united states new\nhome rule charter last week one\nnew york yankees last year last\n4 per cent white house two\naged care plan high school mr.\namerican catholic higher home runs first\ncatholic higher education u. s. state\nla dolce vita president kennedy president\nnational football league last night year\nper cent interest said would home\npotato chip industry san francisco also\ntwo years ago years ago made\n12 months ended anti-trust laws time\n60 home runs mr. kennedy years\nannapolis jan. 7 kansas city three\nanne arundel county premier khrushchev house\nannounce birth daughter los angeles week", | |
"stream": "stdout" | |
}, | |
{ | |
"output_type": "stream", | |
"text": "\n \n", | |
"stream": "stdout" | |
} | |
], | |
"language": "python", | |
"trusted": true, | |
"collapsed": false | |
}, | |
{ | |
"metadata": {}, | |
"cell_type": "markdown", | |
"source": " \n \nTechnique 2: PMI and Chi-squared Collocations\n" | |
}, | |
{ | |
"metadata": {}, | |
"cell_type": "code", | |
"input": "def find_colloc_pmi_chisq(words_cleaned_nopunc):\n bigram_measures = nltk.collocations.BigramAssocMeasures()\n trigram_measures = nltk.collocations.TrigramAssocMeasures()\n finder1 = BigramCollocationFinder.from_words(words_cleaned_nopunc)\n finder2 = TrigramCollocationFinder.from_words(words_cleaned_nopunc)\n finder1.apply_freq_filter(2)\n finder2.apply_freq_filter(2)\n bg_pmi = finder1.nbest(bigram_measures.pmi, 20)\n bg_chisq = finder1.nbest(bigram_measures.chi_sq, 20)\n tg_pmi = finder2.nbest(trigram_measures.pmi, 20)\n tg_chisq = finder2.nbest(trigram_measures.chi_sq, 20)\n return bg_pmi, bg_chisq, tg_pmi, tg_chisq\n\nmacbeth_bg_pmi, macbeth_bg_chisq, macbeth_tg_pmi, macbeth_tg_chisq = find_colloc_pmi_chisq(macbeth_words_cleaned_nopunc)\ncaesar_bg_pmi, caesar_bg_chisq, caesar_tg_pmi, caesar_tg_chisq = find_colloc_pmi_chisq(caesar_words_cleaned_nopunc)\nhamlet_bg_pmi, hamlet_bg_chisq, hamlet_tg_pmi, hamlet_tg_chisq = find_colloc_pmi_chisq(hamlet_words_cleaned_nopunc)\n\n", | |
"prompt_number": 322, | |
"outputs": [], | |
"language": "python", | |
"trusted": true, | |
"collapsed": false | |
}, | |
{ | |
"metadata": {}, | |
"cell_type": "code", | |
"input": "def print_ngram_pmi_chisq_with_tabulate(tg_pmi, tg_chisq, bg_pmi, bg_chisq):\n list_for_printing = []\n for i in range(20):\n list_for_printing.append([\" \".join(tg_pmi[i]), \" \".join(tg_chisq[i]), \" \".join(bg_pmi[i]), \" \".join(bg_chisq[i])])\n print tabulate(list_for_printing, headers=[\"Trigram-PMI\", \"Trigram-ChiSq\", \"Bigram-PMI\", \"Bigram-ChiSq\"])", | |
"prompt_number": 323, | |
"outputs": [], | |
"language": "python", | |
"trusted": true, | |
"collapsed": false | |
}, | |
{ | |
"metadata": {}, | |
"cell_type": "code", | |
"input": "print \"RESULTS FOR MACBETH\"\nprint_ngram_pmi_chisq_with_tabulate(macbeth_tg_pmi, macbeth_tg_chisq, macbeth_bg_pmi, macbeth_bg_chisq)\nprint \" \"\nprint \"RESULTS FOR CAESAR\"\nprint_ngram_pmi_chisq_with_tabulate(caesar_tg_pmi, caesar_tg_chisq, caesar_bg_pmi, caesar_bg_chisq)\nprint \" \"\nprint \"RESULTS FOR HAMLET\"\nprint_ngram_pmi_chisq_with_tabulate(hamlet_tg_pmi, hamlet_tg_chisq, hamlet_bg_pmi, hamlet_bg_chisq)\nprint \" \"", | |
"prompt_number": 324, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"text": "RESULTS FOR MACBETH\nTrigram-PMI Trigram-ChiSq Bigram-PMI Bigram-ChiSq\n--------------------- --------------------- ----------------- -----------------\nburne cauldron bubble children shall kings musicke song musicke song\nfire burne cauldron wilt thou father prince cumberland prince cumberland\ntoyle trouble fire lesse deseru d thunder lightning ten thousand\ntrouble fire burne seyward ten thousand ten thousand dy de\nseyward ten thousand cauldron bubble 2 dy de y sey\ndouble toyle trouble toyle trouble fire y sey thunder lightning\ncauldron bubble 2 ring alarum bell drum colours burne cauldron\nring alarum bell till byrnane wood pallace gate cauldron bubble\ndouble double toyle byrnane wood come burne cauldron fire burne\ntill byrnane wood double toyle trouble cauldron bubble weyward sisters\nknock knock knock double double toyle fire burne wee l\nbyrnane wood come old man tis toyle trouble drum colours\ngod blesse vs macbeth haile thee weyward sisters pallace gate\nwilt thou father looke like th remoue meanes thane cawdor\nlesse deseru d ile doe ile england ireland toyle trouble\nwood come dunsinane trouble fire burne laugh scorne remoue meanes\nhaile king scotland fire burne cauldron ne re trouble fire\nthane cawdor liues burne cauldron bubble lesse deseru england ireland\nchildren shall kings god blesse vs sorry sight ne re\nappar macbeth macbeth appar macbeth macbeth weyard sisters laugh scorne\n \nRESULTS FOR CAESAR\nTrigram-PMI Trigram-ChiSq Bigram-PMI Bigram-ChiSq\n------------------------ ------------------------ ----------------- -----------------\ngoing dwell married ill temper d ciuill strife ciuill strife\ngraunt woman withall bid giue thee harme intended wee l\nbreefely wisely truly man directly breefely itching palme harme intended\nteare bad verses caesars funerall friend popillius lena itching palme\nwoman withall woman cicero one cicero dwell married popillius lena\nwhether going dwell euer euer farewell low alarums ne re\ndirectly breefely wisely ple plucke downe falling sicknesse ides march\nbeware ides march therein yee gods varrus claudio dwell married\nsonne marcus cato ride ride messala ne re low alarums\nmarcus cato hoe name whether going pompeyes porch falling sicknesse\ntherein yee gods marcus cato hoe therein yee varrus claudio\ndwell married man sonne marcus cato went schoole beware ides\nride ride messala directly breefely wisely graunt woman pompeyes porch\nname whether going whether going dwell breefely wisely went schoole\nple plucke downe breefely wisely truly liberty freedome therein yee\nspeake strike redresse woman withall woman beware ides market place\nsleep st awake graunt woman withall thunder lightning metellus cymber\nwee l burne going dwell married withall woman caius ligarius\nparting well made teare bad verses woman withall graunt woman\nman directly breefely dwell married man cinna poet breefely wisely\n \nRESULTS FOR HAMLET\nTrigram-PMI Trigram-ChiSq Bigram-PMI Bigram-ChiSq\n------------------------------ ------------------------------ --------------------- ---------------------\nmason shipwright carpenter neuer speake haue barbary horses barbary horses\ncomicall historicall pastorall inobled queene good mason shipwright mason shipwright\nclay made guest father much offended shipwright carpenter shipwright carpenter\nburied christian buriall o pit clay comicall historicall e ene\nmade guest meete ere go bed builds stronger comicall historicall\npit clay made vse art mad guest meete behinde arras\no pit clay pit clay made clowne sings ee n\ndye sleepe sleepe sweare sword sweare historicall pastorall closes consequence\nsweare sword sweare comicall historicall pastorall pit clay builds stronger\ngod buy ye clay made guest behinde arras guest meete\nvse art mad made guest meete ee n clowne sings\ngod blesse sir mason shipwright carpenter cocke crew historicall pastorall\nfather much offended buried christian buriall fro m pit clay\ndeere brothers death god buy ye heau n cocke crew\ninobled queene good dye sleepe sleepe nine yeare fro m\ndrown d drown deere brothers death closes consequence heau n\nere go bed king haue letters e ene nine yeare\nwee l put goe ile follow sixe french wee l\nsit downe let god blesse sir white snow christian buriall\ngoe ile follow charge thee speake powres poyson sixe french\n \n", | |
"stream": "stdout" | |
} | |
], | |
"language": "python", | |
"trusted": true, | |
"collapsed": false | |
}, | |
{ | |
"metadata": {}, | |
"cell_type": "code", | |
"input": "brown_bg_pmi, brown_bg_chisq, brown_tg_pmi, brown_tg_chisq = find_colloc_pmi_chisq(brown_words_nopunc)\n\nprint \"RESULTS FOR BROWN\"\nprint_ngram_pmi_chisq_with_tabulate(brown_tg_pmi, brown_tg_chisq, brown_bg_pmi, brown_bg_chisq)\nprint \" \"", | |
"prompt_number": 325, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"text": "RESULTS FOR BROWN\nTrigram-PMI Trigram-ChiSq Bigram-PMI Bigram-ChiSq\n----------------------------------- --------------------------------- -------------------- -------------------\nku klux klan prince souvanna phouma bateau neckline viet nam\npinar del rio jury room said cape cod hong kong\nhyannis port mass. three sisters mrs. corpus christi dolce vita\n58th precinct 23d washington moscow last del rio notre dame\nfort knox ky. despite mr. rusk's dimes quarters scottish rite\njudges 58th precinct w. buchheister president ethnic groupings duncan phyfe\nracial discrimination employment first atomic submarine hays kan. sterling township\nmakes cleaning drying seeing hit home hazards pedestrians bateau neckline\nelectronic data processing entertained luncheon home klux klan cape cod\ntraffic hazards pedestrians members book club ku klux corpus christi\nnotre dame chapter national christian family pinar del del rio\nprince souvanna phouma american friends service puerto rico dimes quarters\nstephanie shaw hillsboro big four summit scenic effects ethnic groupings\nla dolce vita foreign relations committee tanks artillery hays kan.\nprecinct 23d ward yards per game wasteful duplication hazards pedestrians\nuniversity's charter by-laws relations national democratic bake slowly klux klan\nendowments institutions established since august 1 chisholm trail ku klux\nguilty reckless driving national council jewish cleaning drying pinar del\nduncan phyfe furniture armed services committee clinton bowman puerto rico\nconvicted engaging conspiracy association university professors coral gables scenic effects\n \n", | |
"stream": "stdout" | |
} | |
], | |
"language": "python", | |
"trusted": true, | |
"collapsed": false | |
}, | |
{ | |
"metadata": {}, | |
"cell_type": "markdown", | |
"source": "Technique 3: TF-IDF" | |
}, | |
{ | |
"metadata": {}, | |
"cell_type": "code", | |
"input": "# tf-idf of each word with respect to a document: (1 + log(tf)) * log(N/df)\n\n\ndef compute_idf_scores(collec_list, collec_set):\n doc_freq_counter = Counter()\n num_of_docs = len(collec_list)\n for word in collec_set:\n for i in range(len(collec_list)):\n if word in collec_list[i]:\n doc_freq_counter[word] += 1 \n idf_scores_dict = {} \n for word in doc_freq_counter:\n idf_scores_dict[word] = log ( float(num_of_docs) / float(doc_freq_counter[word]) ) \n return idf_scores_dict \n\ndef compute_tf_idf(document, idf_scores_dict):\n words_fd = nltk.FreqDist(document)\n tf_idf = Counter() #storing in a Counter() rather than dict so that i can add Counters and use most_common() later \n for word in words_fd:\n tf_idf[word] = (1.0 + log( float(words_fd[word]) )) * idf_scores_dict[word]\n return tf_idf\n\ndef print_tf_idf_with_tabulate(tfidf_list):\n list_for_printing = []\n for word, score in tfidf_list:\n list_for_printing.append([word, score])\n print tabulate(list_for_printing, headers=[\"Word\", \"TF-IDF Score\"])\n\n", | |
"prompt_number": 326, | |
"outputs": [], | |
"language": "python", | |
"trusted": true, | |
"collapsed": false | |
}, | |
{ | |
"metadata": {}, | |
"cell_type": "code", | |
"input": "shakespeare_collection_list = [macbeth_words_cleaned_nopunc] + [caesar_words_cleaned_nopunc] + [hamlet_words_cleaned_nopunc]\nshakespeare_collection_set = set(macbeth_words_cleaned_nopunc + caesar_words_cleaned_nopunc + hamlet_words_cleaned_nopunc) \nshakespeare_idf_scores = compute_idf_scores(shakespeare_collection_list, shakespeare_collection_set)\n\nmacbeth_tf_idf = compute_tf_idf(macbeth_words_cleaned_nopunc, shakespeare_idf_scores)\ncaesar_tf_idf = compute_tf_idf(caesar_words_cleaned_nopunc, shakespeare_idf_scores)\nhamlet_tf_idf = compute_tf_idf(hamlet_words_cleaned_nopunc, shakespeare_idf_scores)\n\nprint \"RESULTS FOR MACBETH\"\nprint_tf_idf_with_tabulate(macbeth_tf_idf.most_common()[0:20])\nprint \" \"\nprint \"RESULTS FOR CAESAR\"\nprint_tf_idf_with_tabulate(caesar_tf_idf.most_common()[0:20])\nprint \" \"\nprint \"RESULTS FOR HAMLET\"\nprint_tf_idf_with_tabulate(hamlet_tf_idf.most_common()[0:20])\nprint \" \"\n", | |
"prompt_number": 327, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"text": "RESULTS FOR MACBETH\nWord TF-IDF Score\n--------- --------------\nmacbeth 5.28066\nbanquo 4.75941\nthane 4.63491\ncawdor 4.44336\nscotland 3.82856\nduncan 3.62826\nmacduffe 3.62826\nfleans 3.38311\ndunsinane 3.38311\nmurth 3.23641\nmacduff 3.06706\npowre 3.06706\nmalcolme 3.06706\nenglish 3.06706\nthanes 2.86676\ncauldron 2.86676\nseyward 2.86676\ntitle 2.86676\ntoth 2.86676\nhorror 2.86676\n \nRESULTS FOR CAESAR\nWord TF-IDF Score\n--------- --------------\ncassius 5.85641\nantony 5.68465\ncaesars 5.06561\ncaes 4.79796\ncaska 4.678\ntitinius 4.38976\nmessala 4.33341\noctauius 4.33341\nromans 4.27401\nlucius 4.27401\ncapitoll 4.21122\ncaius 3.9165\nphilippi 3.73297\ncinna 3.73297\npindarus 3.73297\nlucillius 3.62826\nmetellus 3.62826\ncymber 3.51251\ndecius 3.51251\nportia 3.51251\n \nRESULTS FOR HAMLET\nWord TF-IDF Score\n------------- --------------\nhamlet 5.9532\nhoratio 4.87123\nlaertes 4.79796\nophelia 4.44336\ndenmarke 4.33341\ngertrude 3.82856\nverie 3.73297\nexcellent 3.73297\nfortinbras 3.73297\npyrrhus 3.62826\nguildensterne 3.62826\nphrase 3.51251\nlaw 3.23641\nhamlets 3.23641\ne 3.23641\nscull 3.06706\ndanish 3.06706\nrosincrance 3.06706\ndrown 3.06706\nmaid 3.06706", | |
"stream": "stdout" | |
}, | |
{ | |
"output_type": "stream", | |
"text": "\n \n", | |
"stream": "stdout" | |
} | |
], | |
"language": "python", | |
"trusted": true, | |
"collapsed": false | |
}, | |
{ | |
"metadata": {}, | |
"cell_type": "code", | |
"input": "def create_collection_list(words_nopunc): \n #this is my little hack workaround to the fact that the Brown corpus isn't neatly divided into articles \n #i'm dividing up all words into \"documents\" of 5,000 words each \n collection_list = []\n for i in range(0,len(words_nopunc),5000):\n if i+5000 <= len(words_nopunc):\n collection_list.append(words_nopunc[i:i+5000])\n collection_list.append(words_nopunc[((len(words_nopunc)//5000)*5000):len(words_nopunc)])\n return collection_list \n\n\nbrown_collection_list = create_collection_list(brown_words_nopunc) \nbrown_collection_set = set(brown_words_nopunc) \nbrown_idf_scores = compute_idf_scores(brown_collection_list, brown_collection_set)\n\nbrown_tf_idf_list = []\nfor i in range(len(brown_collection_list)):\n doc_tfidf = compute_tf_idf(brown_collection_list[i], brown_idf_scores)\n brown_tf_idf_list.append(doc_tfidf)\n", | |
"prompt_number": 328, | |
"outputs": [], | |
"language": "python", | |
"trusted": true, | |
"collapsed": false | |
}, | |
{ | |
"metadata": {}, | |
"cell_type": "code", | |
"input": "brown_tf_idf_total = Counter()\nfor counter in brown_tf_idf_list: #sum up all tf-idf scores for each word across collection\n brown_tf_idf_total = brown_tf_idf_total + counter\n\nprint \"RESULTS FOR BROWN\"\nprint_tf_idf_with_tabulate(brown_tf_idf_total.most_common()[0:20])\nprint \" \"", | |
"prompt_number": 329, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"text": "RESULTS FOR BROWN\nWord TF-IDF Score\n--------- --------------\nmantle 13.7066\nlibrary 13.1159\nruns 12.4353\ndallas 12.3768\ngame 12.2506\nsenate 12.0787\nbaseball 12.0233\nlaos 12.0209\njury 11.9262\ngames 11.9193\npalmer 11.8697\ntax 11.8277\nyesterday 11.3573\nseason 11.2245\navenue 11.219\ncongo 10.9629\nteam 10.9073\nyankees 10.8819\ngeorgia 10.7556\nmaris 10.6027", | |
"stream": "stdout" | |
}, | |
{ | |
"output_type": "stream", | |
"text": "\n \n", | |
"stream": "stdout" | |
} | |
], | |
"language": "python", | |
"trusted": true, | |
"collapsed": false | |
}, | |
{ | |
"metadata": {}, | |
"cell_type": "markdown", | |
"source": "Apply All Techniques on Mystery Text" | |
}, | |
{ | |
"metadata": {}, | |
"cell_type": "code", | |
"input": "f = open('mystery.txt','r')\nmystery = f.read()\nmystery_articles = mystery.split('\\n \\n\\n')\n\nmystery_words = nltk.word_tokenize(mystery)\nmystery_articles_tokenized = []\nfor article in mystery_articles:\n tokenized = nltk.word_tokenize(article)\n mystery_articles_tokenized.append(tokenized)\n\ndef modified_get_cleaned_words(unclean_words): #modifying get_cleaned_words() function from above. \n #That one was written to take sents as input. Here I have words as input.\n cleaned_words = []\n for i in range(len(unclean_words)):\n if len(unclean_words[i]) == 2 and not unclean_words[i][0].isalnum() and not unclean_words[i][1].isalnum(): #to get rid of weird double punctuations like \"''\" \n continue \n if unclean_words[i] not in string.punctuation and unclean_words[i].lower().strip(string.punctuation) not in stopwords.words('english'):\n cleaned_words.append(unclean_words[i].lower().strip(string.punctuation))\n return cleaned_words\n\nmystery_words_cleaned_nopunc = modified_get_cleaned_words(mystery_words) #clean mystery_words\nmystery_articles_cleaned_nopunc = [] #clean mystery_articles_tokenized \nfor article in mystery_articles_tokenized:\n mystery_articles_cleaned_nopunc.append(modified_get_cleaned_words(article))\n\n", | |
"prompt_number": 330, | |
"outputs": [], | |
"language": "python", | |
"trusted": true, | |
"collapsed": false | |
}, | |
{ | |
"metadata": {}, | |
"cell_type": "code", | |
"input": "# Technique 1: Frequent N-Grams \nmystery_trigram_fd, mystery_bigram_fd, mystery_unigram_fd = get_ngrams_fds(mystery_words_cleaned_nopunc)\nprint \"RESULTS FOR MYSTERY\"\nprint_ngrams_with_tabulate(mystery_trigram_fd, mystery_bigram_fd, mystery_unigram_fd)\nprint \" \"", | |
"prompt_number": 331, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"text": "RESULTS FOR MYSTERY\nTrigrams Bigrams Unigrams\n--------------------------- ---------------------- ----------\nmln tonnes vs mln tonnes said\nu.s agriculture department last month mln\nmln tonnes last mln dlrs pct\ntonnes last month billion dlrs tonnes\ntrade sources said sources said u.s\nlast month exports mln barrels dlrs\nweek ended march new york dollar\nagriculture department said last year last\ntonnes free market department said trade\necus per tonne bank japan would\nmln last month traders said oil\nlast month usda tonnes vs wheat\nmonth exports 1985/86 u.s agriculture japan\npct rise january pct sulphur year\nfree market barley dealers said yen\nlast month stocks week ended new\npct year ago agriculture department prices\nbank japan intervenes crude oil market\ndlrs 75 cts heating oil coffee\ndlrs fob gulf official said bank", | |
"stream": "stdout" | |
}, | |
{ | |
"output_type": "stream", | |
"text": "\n \n", | |
"stream": "stdout" | |
} | |
], | |
"language": "python", | |
"trusted": true, | |
"collapsed": false | |
}, | |
{ | |
"metadata": {}, | |
"cell_type": "code", | |
"input": "# Technique 2: PMI and Chi-squared Collocations\nmystery_bg_pmi, mystery_bg_chisq, mystery_tg_pmi, mystery_tg_chisq = find_colloc_pmi_chisq(mystery_words_cleaned_nopunc)\nprint \"RESULTS FOR MYSTERY\"\nprint_ngram_pmi_chisq_with_tabulate(mystery_tg_pmi, mystery_tg_chisq, mystery_bg_pmi, mystery_bg_chisq)\nprint \" \"", | |
"prompt_number": 332, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"text": "RESULTS FOR MYSTERY\nTrigram-PMI Trigram-ChiSq Bigram-PMI Bigram-ChiSq\n------------------------------- --------------------------- ----------------- -----------------------\n274 264 groundnutseed undermine confidence future 04/09/87 03/09/87 buenos aires\nbarney harris upham 2,000/2,000 apr egypt 10.8 13.8 van horick\nentre rios misiones could undermine confidence 103 102 santa fe\nexpellers 103 102 fraternite matin said 20-30 5,000-7,000 cape spencer\nfabricated mica consultancy move said production 21.20 dlrs/barrel shearson lehman\nfed's 5-1/2 8-1/2 27 said stocks 21.9 25.8 dean witter\ngems castor sandalwood expected average said 22.2 23.4 merrill lynch\njuices purees pulp said production conditional 23.8 29.3 excluded countertrading\nlaying golden egg crop said generally 264 groundnutseed hrs edt\npath signalling imminent cts said raising 274 264 nihon keizai\npurees pulp tomato said movement since 323 313 1,500/1,700 13-20/4\nwen wei po said sunflower maize 36.0 cents/gallon 13-20/4 toledo/seaforth\n36.0 cents/gallon chemical said view g-7 43.9 48.8 2,000/2,000 apr\n54.5 dual purpose minister said movement 49.5 0.125 25-4/5-5 naantali/saudi\ncake expellers 103 inc reported said 5-1/2 8-1/2 4,000/3,000 20-30/4\ndepartment's counselor belgrade said farm issue 52-57 d.i bahia blanca\nfertile land insect ltd zurich said 54.5 dual burro creek\nhighly visible drama agent country said 83 mm daps 24-27/4\nkept players sidelined country said agent 93.90 94.49 days/8,000 13-15/4\nonwards 330 320 said import barriers asbestos fibre enquiries antwerp/libya\n \n", | |
"stream": "stdout" | |
} | |
], | |
"language": "python", | |
"trusted": true, | |
"collapsed": false | |
}, | |
{ | |
"metadata": {}, | |
"cell_type": "code", | |
"input": "# Technique 3: TF-IDF\n\nmystery_idf_scores = compute_idf_scores(mystery_articles_cleaned_nopunc, set(mystery_words_cleaned_nopunc))\n\nmystery_tf_idf_list = []\nfor i in range(len(mystery_articles_cleaned_nopunc)):\n doc_tfidf = compute_tf_idf(mystery_articles_cleaned_nopunc[i], mystery_idf_scores)\n mystery_tf_idf_list.append(doc_tfidf)\n \nmystery_tf_idf_total = Counter()\nfor counter in mystery_tf_idf_list: \n mystery_tf_idf_total = mystery_tf_idf_total + counter\n\nprint \"RESULTS FOR MYSTERY\"\nprint_tf_idf_with_tabulate(mystery_tf_idf_total.most_common()[0:20])\nprint \" \"\n", | |
"prompt_number": 333, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"text": "RESULTS FOR MYSTERY\nWord TF-IDF Score\n------- --------------\nmln 330.91\npct 314.53\ntonnes 314.158\ndollar 272.326\ndlrs 263.61\nu.s 252.779\nyen 249.776\noil 245.817\njapan 241.153\nwheat 239.961\ntrade 239.15\nwould 238.462\nlast 222.119\nprices 221.353\nmarket 220.56\nbank 217.115\ncoffee 216.79\nyear 216.131\nbillion 208.947\nnew 208.251\n \n", | |
"stream": "stdout" | |
} | |
], | |
"language": "python", | |
"trusted": true, | |
"collapsed": false | |
} | |
], | |
"metadata": {} | |
} | |
], | |
"metadata": { | |
"name": "", | |
"signature": "sha256:5ff3f7c28f62e728da97505ae9e526c0d0c09ea51a158fe033094a901e9e1a8d" | |
}, | |
"nbformat": 3 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment