Skip to content

Instantly share code, notes, and snippets.

@akhileshravi
Last active October 6, 2019 23:08
Show Gist options
  • Save akhileshravi/b97abc7b5464abcc51a63170dfe197d5 to your computer and use it in GitHub Desktop.
Save akhileshravi/b97abc7b5464abcc51a63170dfe197d5 to your computer and use it in GitHub Desktop.
Assignment2NLP_16110007_Akhilesh
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"# Assignment_2: Akhilesh Ravi-16110007\n",
"#Using Jane Austen Novels for Language Model Generation\n",
"\n",
"# Importing all packages\n",
"import math\n",
"import numpy as np\n",
"from operator import itemgetter\n",
"from nltk import word_tokenize, sent_tokenize\n",
"from string import punctuation\n",
"from nltk.util import ngrams"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"import re\n",
"from sklearn.model_selection import train_test_split "
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Pre-Processing of the dataset"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"DATASET- Jane Austen Novels: The Complete Works of Jane Austen take one line, neglect empty line and then add it to line_set then lower down whole set(string)\n"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Number of Sentences = 35056\n",
"Number of Types = 14651\n",
"Number of Tokens = 914639\n",
"TTR= 0.01601834166266691\n"
]
}
],
"source": [
"with open(\"JaneAusten.txt\", 'r') as file: \n",
" lines = ['']\n",
" for line in (l.rstrip() for l in file):\n",
" if line != '' or lines[-1] != '\\n': \n",
" lines.append(line + '\\n')\n",
" text = \"\".join(lines)\n",
"text = text.lower() \n",
"\n",
"# Tokenizing sentences\n",
"# Vocabulary dictionary and corpus\n",
"vocabulary = {}\n",
"corpus = []\n",
"list_sentence = sent_tokenize(text)\n",
"# <s> is added to the start of each sentence\n",
"# </s> is added to the end of sentences\n",
"for sentence in list_sentence:\n",
" word_list = word_tokenize(sentence) # tokenizing each word of that sentence\n",
" final_sentence = ['<s>']\n",
" for word in word_list:\n",
" # Obtains the frequency of the words in the corpus\n",
" if len(set(word).intersection(punctuation)) == 0 and (len(word)>1 or word == 'a' or word == 'i') :\n",
" final_sentence.append(word)\n",
" if word not in vocabulary.keys(): \n",
" vocabulary[word] = 1\n",
" else:\n",
" vocabulary[word] += 1\n",
" \n",
" final_sentence += ['</s>']\n",
" corpus += [\" \".join(final_sentence)] #adding processed sentence to the corpus\n",
"\n",
"#Now Processed Corpus is ready \n",
"number_of_sentence = len(corpus)\n",
"# Vocabulary list doesn't contain the START and END word used in Pre-Processing\n",
"vocabulary['<s>'] = 2*number_of_sentence\n",
"vocabulary['</s>'] = 2*number_of_sentence\n",
"# Now we have Processed sentence corpus and vocabulary list. \n",
"#Processed Corpus DATA ANALYSIS \n",
"tokens = sum(vocabulary.values())\n",
"types = len(vocabulary)\n",
"print ('Number of Sentences = ' + str(number_of_sentence))\n",
"print ('Number of Types = ' + str(types))\n",
"print ('Number of Tokens = ' + str(tokens))\n",
"print (\"TTR= \" + str(types/tokens))\n",
"# train-test split in 80:20 ratio\n",
"train_data, test_data = train_test_split(corpus, test_size=0.20, random_state=64) \n",
" "
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['<s> project gutenberg the complete works of jane austen by jane austen this ebook is for the use of anyone anywhere at no cost and with almost no restrictions whatsoever </s>',\n",
" '<s> you may copy it give it away or it under the terms of the project gutenberg license included with this ebook or online at title the complete project gutenberg works of jane austen author jane austen editor david widger release date january 25 2010 ebook 31100 language english character set encoding ascii start of this project gutenberg ebook the works of jane austen produced by many project gutenberg volunteers </s>',\n",
" '<s> the works of jane austen edited by david widger project gutenberg editions dedication this jane austen collection is dedicated to alice goodson hart woodby note the accompanying html file has active links to all the volumes and chapters in this set </s>',\n",
" '<s> contents persuasion northanger abbey mansfield park emma lady susan love and freindship and other early works pride and prejudice sense and sensibility persuasion by jane austen 1818 chapter sir walter elliot of kellynch hall in somersetshire was a man who for his own amusement never took up any book but the baronetage there he found occupation for an idle hour and consolation in a distressed one there his faculties were roused into admiration and respect by contemplating the limited remnant of the earliest patents there any unwelcome sensations arising from domestic affairs changed naturally into pity and contempt as he turned over the almost endless creations of the last century and there if every other leaf were powerless he could read his own history with an interest which never failed </s>',\n",
" '<s> this was the page at which the favourite volume always opened elliot of kellynch hall </s>',\n",
" '<s> walter elliot born march 1760 married july 15 1784 elizabeth daughter of james stevenson esq </s>',\n",
" '<s> of south park in the county of gloucester by which lady who died 1800 he has issue elizabeth born june 1785 anne born august 1787 a son november 1789 mary born november 20 1791 </s>',\n",
" '<s> precisely such had the paragraph originally stood from the hands but sir walter had improved it by adding for the information of himself and his family these words after the date of mary birth married december 16 1810 charles son and heir of charles musgrove esq </s>',\n",
" '<s> of uppercross in the county of somerset and by inserting most accurately the day of the month on which he had lost his wife </s>',\n",
" '<s> then followed the history and rise of the ancient and respectable family in the usual terms how it had been first settled in cheshire how mentioned in dugdale serving the office of high sheriff representing a borough in three successive parliaments exertions of loyalty and dignity of baronet in the first year of charles ii with all the marys and elizabeths they had married forming altogether two handsome duodecimo pages and concluding with the arms and motto principal seat kellynch hall in the county of somerset and sir walter handwriting again in this finale heir presumptive william walter elliot great grandson of the second sir walter </s>',\n",
" '<s> vanity was the beginning and the end of sir walter elliot character vanity of person and of situation </s>',\n",
" '<s> he had been remarkably handsome in his youth and at was still a very fine man </s>']"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"corpus[:12]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Q1a MLE for N-Grams : N= 1,2,3,4"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"# Unigrams\n",
"probability_unigram = {} \n",
"for i in vocabulary.keys():\n",
" probability_unigram[i] = (float(vocabulary[i]))/tokens \n",
"\n",
"\n",
"# Bigrams\n",
"\n",
"probability_bigram = {} # bigram probabilities\n",
"bigram_list = {} # biagram list\n",
"# from each sentence in corpus extracting all the biagrams\n",
"for sentence in corpus:\n",
" words_in_sentence = sentence.split()\n",
" for i in range(1,len(words_in_sentence)):\n",
" j = (words_in_sentence[i-1], words_in_sentence[i])#taking all consecutive combination\n",
" try:\n",
" bigram_list[j] += 1\n",
" except:\n",
" bigram_list[j] = 1\n",
" \n",
"for i in bigram_list.keys():\n",
" probability_bigram[i] = float(bigram_list[i])/vocabulary[i[0]]\n",
"\n",
"\n",
"# Trigrams\n",
"probability_trigram = {} #trigram probabilities\n",
"trigram_list = {}\n",
"\n",
"for sentence in corpus:\n",
" words_in_sentence = sentence.split()\n",
" for i in range(2,len(words_in_sentence)):\n",
" j = (words_in_sentence[i-2], words_in_sentence[i-1], words_in_sentence[i])\n",
" try:\n",
" trigram_list[j] += 1\n",
" except:\n",
" trigram_list[j] = 1\n",
"\n",
"\n",
"for i in trigram_list.keys():\n",
" probability_trigram[i] = (trigram_list[i])/bigram_list[(i[0], i[1])]\n",
"\n",
"# Quadrigrams\n",
"probability_quadgram = {}\n",
"quadgram_list = {}\n",
"\n",
"for sentence in corpus:\n",
" words_in_sentence = sentence.split()\n",
" for i in range(3,len(words_in_sentence)):\n",
" j = (words_in_sentence[i-3], words_in_sentence[i-2], words_in_sentence[i-1], words_in_sentence[i])\n",
" try:\n",
" quadgram_list[j] += 1\n",
" except:\n",
" quadgram_list[j] = 1\n",
"\n",
"#NOw we have list a histogram of QUADgram in the corpus \n",
"for i in quadgram_list.keys():\n",
" probability_quadgram[i] = (quadgram_list[i]*1.0)/trigram_list[(i[0], i[1], i[2])]"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"UNIGRAM-MLE - Top 15\n",
"\n",
"('<s>', 0.07665537988211743)\n",
"('</s>', 0.07665537988211743)\n",
"('the', 0.03128994062138177)\n",
"('to', 0.028454942332439354)\n",
"('and', 0.02644321967464759)\n",
"('of', 0.025319278972359586)\n",
"('a', 0.015726423211780822)\n",
"('her', 0.015204905979298937)\n",
"('i', 0.015052933452433146)\n",
"('in', 0.013320009315150568)\n",
"('was', 0.012856438441833335)\n",
"('it', 0.0118308972173721)\n",
"('she', 0.011605671745901935)\n",
"('not', 0.010653383466045073)\n",
"('that', 0.009639868844429331)\n",
"\n",
"BIGRAM-MLE - Top 15\n",
"\n",
"(('whatsoever', '</s>'), 1.0)\n",
"(('david', 'widger'), 1.0)\n",
"(('2010', 'ebook'), 1.0)\n",
"(('31100', 'language'), 1.0)\n",
"(('encoding', 'ascii'), 1.0)\n",
"(('edited', 'by'), 1.0)\n",
"(('dedication', 'this'), 1.0)\n",
"(('goodson', 'hart'), 1.0)\n",
"(('woodby', 'note'), 1.0)\n",
"(('html', 'file'), 1.0)\n",
"(('patents', 'there'), 1.0)\n",
"(('1760', 'married'), 1.0)\n",
"(('1784', 'elizabeth'), 1.0)\n",
"(('stevenson', 'esq'), 1.0)\n",
"(('esq', '</s>'), 1.0)\n",
"\n",
"TRIGRAM-MLE - Top 15\n",
"\n",
"(('gutenberg', 'the', 'complete'), 1.0)\n",
"(('complete', 'works', 'of'), 1.0)\n",
"(('austen', 'by', 'jane'), 1.0)\n",
"(('ebook', 'is', 'for'), 1.0)\n",
"(('anyone', 'anywhere', 'at'), 1.0)\n",
"(('anywhere', 'at', 'no'), 1.0)\n",
"(('no', 'cost', 'and'), 1.0)\n",
"(('almost', 'no', 'restrictions'), 1.0)\n",
"(('no', 'restrictions', 'whatsoever'), 1.0)\n",
"(('restrictions', 'whatsoever', '</s>'), 1.0)\n",
"(('may', 'copy', 'it'), 1.0)\n",
"(('copy', 'it', 'give'), 1.0)\n",
"(('license', 'included', 'with'), 1.0)\n",
"(('included', 'with', 'this'), 1.0)\n",
"(('ebook', 'or', 'online'), 1.0)\n",
"\n",
"QUADGRAM-MLE - Top 15\n",
"\n",
"(('project', 'gutenberg', 'the', 'complete'), 1.0)\n",
"(('gutenberg', 'the', 'complete', 'works'), 1.0)\n",
"(('the', 'complete', 'works', 'of'), 1.0)\n",
"(('complete', 'works', 'of', 'jane'), 1.0)\n",
"(('works', 'of', 'jane', 'austen'), 1.0)\n",
"(('jane', 'austen', 'by', 'jane'), 1.0)\n",
"(('austen', 'by', 'jane', 'austen'), 1.0)\n",
"(('austen', 'this', 'ebook', 'is'), 1.0)\n",
"(('this', 'ebook', 'is', 'for'), 1.0)\n",
"(('ebook', 'is', 'for', 'the'), 1.0)\n",
"(('is', 'for', 'the', 'use'), 1.0)\n",
"(('for', 'the', 'use', 'of'), 1.0)\n",
"(('use', 'of', 'anyone', 'anywhere'), 1.0)\n",
"(('of', 'anyone', 'anywhere', 'at'), 1.0)\n",
"(('anyone', 'anywhere', 'at', 'no'), 1.0)\n"
]
}
],
"source": [
"print(\"UNIGRAM-MLE - Top 15\\n\")\n",
"for i in sorted(probability_unigram.items(), key=lambda x:x[1], reverse=True)[:15]:\n",
" print(i)\n",
" \n",
"print(\"\\nBIGRAM-MLE - Top 15\\n\")\n",
"for i in sorted(probability_bigram.items(), key=lambda x:x[1], reverse=True)[:15]:\n",
" print(i)\n",
" \n",
"print(\"\\nTRIGRAM-MLE - Top 15\\n\")\n",
"for i in sorted(probability_trigram.items(), key=lambda x:x[1], reverse=True)[:15]:\n",
" print(i)\n",
"\n",
"print(\"\\nQUADGRAM-MLE - Top 15\\n\")\n",
"for i in sorted(probability_quadgram.items(), key=lambda x:x[1], reverse=True)[:15]:\n",
" print(i)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Q1a"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"N: 1\n",
"Number of possible n-grams: 14651\n",
"Number of present n-grams: 14651\n",
"\n",
"N: 2\n",
"Number of possible n-grams: 214651801\n",
"Number of present n-grams: 213027\n",
"\n",
"N: 3\n",
"Number of possible n-grams: 3144863536451\n",
"Number of present n-grams: 527634\n",
"\n",
"N: 4\n",
"Number of possible n-grams: 46075395672543601\n",
"Number of present n-grams: 672052\n",
"\n"
]
}
],
"source": [
"\n",
"def NGrams(N):\n",
" possible = types**N\n",
" NGram_list = []\n",
" for sentence in corpus:\n",
" lst = list(ngrams(sentence.split(), N))\n",
" NGram_list += lst\n",
" unique_NGrams = set(NGram_list)\n",
" present = len(unique_NGrams)\n",
" return possible, present\n",
"\n",
"for i in range(1,5):\n",
" print('N:',i)\n",
" possible, present = NGrams(i)\n",
" print (\"Number of possible n-grams: \" + str(possible))\n",
" print (\"Number of present n-grams: \" + str(present))\n",
" print()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Q1b"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
"#Generator(type_of_model) for sentence generation \n",
"\n",
"# Sampling Function from the Multinomial is used\n",
"\n",
"def Generator(model_ngram):\n",
" sentence = \"<s>\"\n",
" start = \"<s>\"\n",
" if model_ngram==\"Unigram\":\n",
" word_probability = list(probability_unigram.values())\n",
" word_list = list(probability_unigram.keys())\n",
" while(True):\n",
" lst = np.random.multinomial(1,word_probability)\n",
" word = word_list[int(np.where(lst == 1)[0])]\n",
" if (word==\"<s>\"):\n",
" continue\n",
" elif (word == \"</s>\"):\n",
" sentence += word\n",
" print (sentence)\n",
" break\n",
" else:\n",
" sentence += word + \" \"\n",
" \n",
" elif model_ngram==\"Bigram\":\n",
" while(True):\n",
" probabilities, next_word = [], []\n",
" for i in probability_bigram.keys():\n",
" if(i[0]==start):\n",
" probabilities.append(probability_bigram[i])\n",
" next_word.append(i[1])\n",
" # ValueError: sum(pvals[:-1]) > 1.0 may occur\n",
" # This is because the precision of the numbers in Python are not exact\n",
" # Please Change your system to 32-bit and run it again. Or run this part by part\n",
" lst = (np.random.multinomial(1.0,probabilities))\n",
" word = next_word[int(np.where(lst == 1)[0])]\n",
" if word == \"</s>\":\n",
" sentence += word\n",
" print (sentence)\n",
" break\n",
" else:\n",
" start = word\n",
" sentence += word + \" \"\n",
" \n",
" elif model_ngram==\"Trigram\":\n",
" probabilities, next_word = [], []\n",
" for i in probability_bigram.keys():\n",
" if(i[0]==start):\n",
" probabilities.append(probability_bigram[i])\n",
" next_word.append(i[1]) \n",
" \n",
" lst = np.random.multinomial(1,probabilities)\n",
" word = next_word[int(np.where(lst==1)[0])]\n",
" bigTuple = (start,word)\n",
" sentence += word + \" \"\n",
" \n",
" while(True):\n",
" next_wordT, probabilitiesT = [], []\n",
" for i in probability_trigram.keys():\n",
" if(i[0]==bigTuple[0] and i[1] == bigTuple[1]):\n",
" probabilitiesT.append(probability_trigram[i])\n",
" next_wordT.append(i[2])\n",
" \n",
" \n",
" lstT = np.random.multinomial(1,probabilitiesT)\n",
" wordT = next_wordT[int(np.where(lstT==1)[0])]\n",
" bigTuple = (bigTuple[1],wordT)\n",
" \n",
" if wordT == \"</s>\":\n",
" sentence += wordT\n",
" print (sentence)\n",
" break\n",
" else:\n",
" sentence += wordT + \" \"\n",
" \n",
" \n",
" elif model_ngram == \"Quadgram\":\n",
" next_wordB, probabilitiesB = [], []\n",
" for i in probability_bigram.keys():\n",
" if(i[0]==start):\n",
" probabilitiesB.append(probability_bigram[i])\n",
" next_wordB.append(i[1]) \n",
" \n",
" lstB = np.random.multinomial(1,probabilitiesB)\n",
" wordB = next_wordB[int(np.where(lstB==1)[0])]\n",
" bigTuple = (start,wordB)\n",
" sentence += wordB + \" \"\n",
" \n",
" next_wordT, probabilitiesT =[], []\n",
" for i in probability_trigram.keys():\n",
" if(i[0] == bigTuple[0] and i[1]== bigTuple[1]):\n",
" next_wordT.append(i[2])\n",
" probabilitiesT.append(probability_trigram[i])\n",
" \n",
" lstT = np.random.multinomial(1,probabilitiesT)\n",
" wordT = next_wordT[int(np.where(lstT==1)[0])]\n",
" trigTuple = (start,wordB,wordT)\n",
" sentence += wordT + \" \"\n",
" \n",
" while(True):\n",
" next_wordQ, probabilitiesQ = [], []\n",
" for i in probability_quadgram.keys():\n",
" if(i[0]==trigTuple[0] and i[1] == trigTuple[1] and i[2]==trigTuple[2]):\n",
" probabilitiesQ.append(probability_quadgram[i])\n",
" next_wordQ.append(i[3])\n",
" \n",
" \n",
" lstQ = np.random.multinomial(1,probabilitiesQ)\n",
" wordQ = next_wordQ[int(np.where(lstQ==1)[0])]\n",
" trigTuple = (trigTuple[1],trigTuple[2],wordQ) \n",
"\n",
"\n",
" if wordQ == \"</s>\":\n",
" sentence += wordQ\n",
" print (sentence)\n",
" break\n",
" else:\n",
" sentence += wordQ + \" \""
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Q1c - Evaluation"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Perplexity for Add-1 smoothing NGram Language Model= 656.2864111198207\n"
]
}
],
"source": [
"# PERPLEXITY\n",
"# Code For Calculating PERPLEXITY\n",
"# Bigram model\n",
"# Uses Add-one smoothing\n",
"\n",
"bigram_List_Add1, bigram_Add1 = {}, {}\n",
"change = {} # delta change in the count\n",
"for i in bigram_list:\n",
" bigram_Add1[i] = float(bigram_list[i] + 1)/(vocabulary[i[0]] + types)\n",
" bigram_List_Add1[i] = bigram_Add1[i]*vocabulary[i[0]]\n",
" change[i] = bigram_list[i] - bigram_List_Add1[i]\n",
"\n",
" \n",
"def Bigrams_Test():\n",
" ngrams_list = []\n",
" for s in test_data:\n",
" ngrams_list += list(ngrams(s.split(\" \"), 2))\n",
" return ngrams_list\n",
"bigrams_Test = Bigrams_Test()\n",
"\n",
"test_size = 0\n",
"for sentence in test_data:\n",
" test_size += len(sentence.split(\" \"))\n",
"\n",
"perplexity_Add1 = 1.0\n",
"for k in bigrams_Test:\n",
" if(k not in bigram_Add1):\n",
" perplexity_Add1 *= (len(unigrams) ** (1.0/test_size))\n",
" else:\n",
" perplexity_Add1 *= ((1.0/bigram_Add1[k]) ** (1.0/test_size))\n",
" \n",
"\n",
"print (\"Perplexity for Add-1 smoothing NGram Language Model=\", str(perplexity_Add1))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Generating 5 sentences"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" UNIGRAM SENTENCES \n",
"\n",
"<s>not the may such not for nobody that all </s>\n",
"<s></s>\n",
"<s>she know and at of be have fine certainly struck when i rosings as their angry whose in about of whose your him flow were that great but i see by so and there question her than be gate </s>\n",
"<s>been her ready </s>\n",
"<s>observing </s>\n",
"\n",
" Readibility is very bad for UNIGRAMS, the sentences are completely random \n",
"\n",
"\n",
" BIGRAM SENTENCES\n",
"\n",
"<s>professor michael hart </s>\n",
"<s>well as she could make me out of communication in want to meet them bingley called on a little while they were sent off as she saw him marry and no disservice with all things but when frank churchill had returned home was not have been with them and given for me that it so full of something more than i would convey she was up these in conduit street especially as far behind the winter and expense of having an engagement to assure you had done </s>\n",
"<s>well at her more to get all and i used to cole and glad of sisterly partiality provoked her commendation at his goodness does it to see she felt the were made him she was proceeding </s>\n",
"<s>professor michael hart woodby note sent into her well as idle assertions valueless you seem strong habit of disquiet and a doubtful good to the most attentive to hear she would have seized the musgroves she is to be of motives his opinion nature it be attempted </s>\n",
"<s>professor michael hart </s>\n",
"\n",
" Readibility is poor for BIGRAMS (some context is there) \n",
"\n",
"\n",
" TRIGRAM SENTENCES\n",
"\n",
"<s>professor michael hart is the less awful character of his family from whom she could hardly have borne any thing rather than her friend </s>\n",
"<s>professor michael hart the owner he had just entered and an allusion to his purpose by a sudden scheme </s>\n",
"<s>this was all the insolence of ferrars heart and that you should think it will be a very good house almost as many people he did or could love which i understand you perfectly right </s>\n",
"<s>professor michael hart is the company beneath her </s>\n",
"<s>i have interrupted her by hens forsaking their nests or being stolen by a request from thorpe to good nature from any such design in the water and she could not bear writing you know that in lydia marriage been concluded that a persuasion which of course </s>\n",
"\n",
" Readibility is better for TRIGRAMS \n",
"\n",
"\n",
" QUADGRAM SENTENCES\n",
"\n",
"<s>you forget yourself you take me for my impertinence and half afraid of being in love with a proper object </s>\n",
"<s>professor michael hart is the originator of the project license for all works posted with the permission of the copyright holder found at the beginning seemed to occupy her in most unmirthful reflections some time longer in the same inn at the same time there was no reason to complain and was amused enough quite enough still to stand at the door to welcome her and in her first bloom mr elliot had better not meet darcy that to be making such a purchase in his absence and wednesday she was very often influenced rightly by you oftener than i would own at the time </s>\n",
"<s>professor michael hart is the originator of the project gutenberg literary archive foundation how to help produce our new ebooks and how to endeavour to please as he certainly did while he really belonged to another how could he answer it to himself to be relieved from her therefore was so great that mary and anne smiling and blushing very becomingly shewed to mr elliot </s>\n",
"<s>weston had walked in </s>\n",
"<s>they were laying themselves open to that very phrase and to having it sent off in a letter which she had almost as long been ashamed of it </s>\n",
"\n",
" Readibility is fine for QUADGRAMS \n",
"\n"
]
}
],
"source": [
"\n",
"\n",
"print (\" UNIGRAM SENTENCES \\n\")\n",
"for z in range(5):\n",
" Generator(\"Unigram\") \n",
"print (\"\\n Readibility is very bad for UNIGRAMS, the sentences are completely random \\n\\n\")\n",
" \n",
" \n",
"print (\" BIGRAM SENTENCES\\n\")\n",
"for z in range(5):\n",
" Generator(\"Bigram\")\n",
"print (\"\\n Readibility is poor for BIGRAMS (some context is there) \\n\\n\")\n",
"\n",
"print (\" TRIGRAM SENTENCES\\n\")\n",
"for z in range(5):\n",
" Generator(\"Trigram\")\n",
"print (\"\\n Readibility is better for TRIGRAMS \\n\\n\")\n",
"\n",
"print (\" QUADGRAM SENTENCES\\n\")\n",
"for z in range(5):\n",
" Generator(\"Quadgram\")\n",
"print (\"\\n Readibility is fine for QUADGRAMS \\n\")\n",
" "
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.8"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Display the source blob
Display the rendered blob
Raw
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Display the source blob
Display the rendered blob
Raw
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Display the source blob
Display the rendered blob
Raw
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Display the source blob
Display the rendered blob
Raw
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Display the source blob
Display the rendered blob
Raw
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Display the source blob
Display the rendered blob
Raw
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment