Skip to content

Instantly share code, notes, and snippets.

@akhileshravi
Last active December 26, 2019 19:33
Show Gist options
  • Save akhileshravi/d032d227aa5a553fadccd4679b74a0de to your computer and use it in GitHub Desktop.
Save akhileshravi/d032d227aa5a553fadccd4679b74a0de to your computer and use it in GitHub Desktop.
Assignment1_NLP_16110007
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# ASSIGNMENT 1\n",
"NLP\n",
"Akhilesh Ravi\n",
"16110007"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"import re\n",
"import matplotlib.pyplot as plt\n",
"import numpy as np\n",
"from bs4 import BeautifulSoup"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"data = pd.read_csv('tweets-dataset.csv')"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Sentence</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>@BubblyDentist @MeetUunngLee nahi nahi, mere s...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>going to the grammys first entertainment law b...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>bohot hi badiya ji aap sunao?</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>Parvez Musharraf is Digvijay Singh of Pakistan...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>guddu ko bass john cena k sticker ki padii hai...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>Aman ki maa ki... Asha https://twitter.com/ash...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>pakistan can wait more more and more . . . ...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7</th>\n",
" <td>@sagarcasm Jai Mahesh !!</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8</th>\n",
" <td>RT AAP_Najafgarh \" RT AAPInNews : When DDCA lo...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9</th>\n",
" <td>\"Kaam ho jayega, thoda kharcha paani lagega\" \\...</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Sentence\n",
"0 @BubblyDentist @MeetUunngLee nahi nahi, mere s...\n",
"1 going to the grammys first entertainment law b...\n",
"2 bohot hi badiya ji aap sunao?\n",
"3 Parvez Musharraf is Digvijay Singh of Pakistan...\n",
"4 guddu ko bass john cena k sticker ki padii hai...\n",
"5 Aman ki maa ki... Asha https://twitter.com/ash...\n",
"6 pakistan can wait more more and more . . . ...\n",
"7 @sagarcasm Jai Mahesh !!\n",
"8 RT AAP_Najafgarh \" RT AAPInNews : When DDCA lo...\n",
"9 \"Kaam ho jayega, thoda kharcha paani lagega\" \\..."
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"data[:10]"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"# s = re.sub('[^0-9a-zA-Z]+', '*', s)\n",
"# https://stackoverflow.com/questions/12985456/replace-all-non-alphanumeric-characters-in-a-string"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"def cleanText(raw_text):\n",
" '''\n",
" Convert a raw review to a cleaned review\n",
" '''\n",
" text = BeautifulSoup(raw_text, 'html').get_text() #remove html\n",
" words = text.split()\n",
" words = [w for w in words if '@' not in w and '#' not in w] # remove the @-words and #-words\n",
" text = ' '.join(words)\n",
" letters_only = re.sub('[^a-zA-Z]+', ' ', text) # remove non-character\n",
" \n",
" return( letters_only.lower())\n",
"\n",
"vclean = np.vectorize(cleanText)"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Sentence</th>\n",
" <th>Cleaned sentence</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>@BubblyDentist @MeetUunngLee nahi nahi, mere s...</td>\n",
" <td>nahi nahi mere saath jaakar pachtaogi ye uunng...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>going to the grammys first entertainment law b...</td>\n",
" <td>going to the grammys first entertainment law b...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>bohot hi badiya ji aap sunao?</td>\n",
" <td>bohot hi badiya ji aap sunao</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>Parvez Musharraf is Digvijay Singh of Pakistan...</td>\n",
" <td>parvez musharraf is digvijay singh of pakistan...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>guddu ko bass john cena k sticker ki padii hai...</td>\n",
" <td>guddu ko bass john cena k sticker ki padii hai...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>Aman ki maa ki... Asha https://twitter.com/ash...</td>\n",
" <td>aman ki maa ki asha https twitter com ashabhos...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>pakistan can wait more more and more . . . ...</td>\n",
" <td>pakistan can wait more more and more aakhir pa...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7</th>\n",
" <td>@sagarcasm Jai Mahesh !!</td>\n",
" <td>jai mahesh</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8</th>\n",
" <td>RT AAP_Najafgarh \" RT AAPInNews : When DDCA lo...</td>\n",
" <td>rt aap najafgarh rt aapinnews when ddca lowere...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9</th>\n",
" <td>\"Kaam ho jayega, thoda kharcha paani lagega\" \\...</td>\n",
" <td>kaam ho jayega thoda kharcha paani lagega sir...</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Sentence \\\n",
"0 @BubblyDentist @MeetUunngLee nahi nahi, mere s... \n",
"1 going to the grammys first entertainment law b... \n",
"2 bohot hi badiya ji aap sunao? \n",
"3 Parvez Musharraf is Digvijay Singh of Pakistan... \n",
"4 guddu ko bass john cena k sticker ki padii hai... \n",
"5 Aman ki maa ki... Asha https://twitter.com/ash... \n",
"6 pakistan can wait more more and more . . . ... \n",
"7 @sagarcasm Jai Mahesh !! \n",
"8 RT AAP_Najafgarh \" RT AAPInNews : When DDCA lo... \n",
"9 \"Kaam ho jayega, thoda kharcha paani lagega\" \\... \n",
"\n",
" Cleaned sentence \n",
"0 nahi nahi mere saath jaakar pachtaogi ye uunng... \n",
"1 going to the grammys first entertainment law b... \n",
"2 bohot hi badiya ji aap sunao \n",
"3 parvez musharraf is digvijay singh of pakistan... \n",
"4 guddu ko bass john cena k sticker ki padii hai... \n",
"5 aman ki maa ki asha https twitter com ashabhos... \n",
"6 pakistan can wait more more and more aakhir pa... \n",
"7 jai mahesh \n",
"8 rt aap najafgarh rt aapinnews when ddca lowere... \n",
"9 kaam ho jayega thoda kharcha paani lagega sir... "
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"data['Cleaned sentence'] = vclean(data['Sentence'])\n",
"data[:10]"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0 nahi nahi mere saath jaakar pachtaogi ye uunng...\n",
"1 going to the grammys first entertainment law b...\n",
"2 bohot hi badiya ji aap sunao \n",
"3 parvez musharraf is digvijay singh of pakistan...\n",
"4 guddu ko bass john cena k sticker ki padii hai...\n",
"5 aman ki maa ki asha https twitter com ashabhos...\n",
"6 pakistan can wait more more and more aakhir pa...\n",
"7 jai mahesh \n",
"8 rt aap najafgarh rt aapinnews when ddca lowere...\n",
"9 kaam ho jayega thoda kharcha paani lagega sir...\n",
"Name: Cleaned sentence, dtype: object"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"data[:10]['Cleaned sentence']"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [],
"source": [
"def tokenize(s):\n",
" return tuple(s.split())"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Number of tokens: 303161\n",
"Number of word types: 32707\n"
]
}
],
"source": [
"tokens = []\n",
"for i in range(len(data)):\n",
" tokens.extend( tokenize( data.iloc[i]['Cleaned sentence']) )\n",
"sorted_tokens = sorted(tokens)\n",
"word_types = list(set(tokens))\n",
"print('Number of tokens:', len(tokens))\n",
"print('Number of word types:', len(word_types))"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"TTR: 0.10788656852299604\n"
]
}
],
"source": [
"print('TTR:', len(word_types)/len(tokens))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# 2. Zipf's Law"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [],
"source": [
"token_count = {}\n",
"for s in sorted_tokens:\n",
" if s in token_count:\n",
" token_count[s] += 1\n",
" else:\n",
" token_count[s] = 1"
]
},
{
"cell_type": "code",
"execution_count": 38,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Top 50 Word types in decreasing order of number of occurences:\n",
"\n"
]
},
{
"data": {
"text/plain": [
"[('hai', 10030),\n",
" ('to', 4154),\n",
" ('ki', 3224),\n",
" ('ke', 3170),\n",
" ('nahi', 3169),\n",
" ('bhi', 2929),\n",
" ('the', 2866),\n",
" ('se', 2601),\n",
" ('ho', 2365),\n",
" ('ka', 2310),\n",
" ('bhai', 2266),\n",
" ('ko', 2208),\n",
" ('me', 1955),\n",
" ('ye', 1869),\n",
" ('kya', 1815),\n",
" ('hi', 1801),\n",
" ('aur', 1797),\n",
" ('twitter', 1760),\n",
" ('com', 1724),\n",
" ('kar', 1681),\n",
" ('i', 1509),\n",
" ('in', 1387),\n",
" ('t', 1319),\n",
" ('https', 1310),\n",
" ('is', 1296),\n",
" ('mein', 1276),\n",
" ('a', 1202),\n",
" ('ek', 1165),\n",
" ('and', 1126),\n",
" ('status', 1108),\n",
" ('of', 1074),\n",
" ('on', 1071),\n",
" ('na', 1026),\n",
" ('s', 1009),\n",
" ('ab', 969),\n",
" ('toh', 963),\n",
" ('rt', 944),\n",
" ('tha', 937),\n",
" ('http', 905),\n",
" ('for', 885),\n",
" ('you', 885),\n",
" ('aaj', 873),\n",
" ('co', 872),\n",
" ('raha', 868),\n",
" ('par', 826),\n",
" ('ne', 824),\n",
" ('aap', 820),\n",
" ('hain', 816),\n",
" ('koi', 802),\n",
" ('kuch', 801),\n",
" ('liye', 780),\n",
" ('k', 754),\n",
" ('tu', 748),\n",
" ('ji', 747),\n",
" ('it', 704),\n",
" ('p', 702),\n",
" ('sir', 691),\n",
" ('d', 690),\n",
" ('do', 684),\n",
" ('pe', 667),\n",
" ('main', 665),\n",
" ('mujhe', 643),\n",
" ('gaya', 631),\n",
" ('rahe', 621),\n",
" ('h', 619),\n",
" ('baat', 600),\n",
" ('be', 598),\n",
" ('sab', 586),\n",
" ('with', 570),\n",
" ('at', 566),\n",
" ('he', 557),\n",
" ('aa', 554),\n",
" ('de', 536),\n",
" ('url', 532),\n",
" ('jo', 531),\n",
" ('yaar', 530),\n",
" ('kiya', 508),\n",
" ('hum', 499),\n",
" ('hota', 487),\n",
" ('le', 479),\n",
" ('tum', 475),\n",
" ('mere', 468),\n",
" ('this', 454),\n",
" ('diya', 452),\n",
" ('modi', 449),\n",
" ('log', 448),\n",
" ('ya', 425),\n",
" ('my', 424),\n",
" ('tomorrow', 424),\n",
" ('that', 419),\n",
" ('gaye', 415),\n",
" ('bas', 413),\n",
" ('din', 412),\n",
" ('hu', 410),\n",
" ('kabhi', 410),\n",
" ('abhi', 405),\n",
" ('u', 394),\n",
" ('india', 388),\n",
" ('day', 387),\n",
" ('time', 385)]"
]
},
"execution_count": 38,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"sorted_token_count = list(token_count.items())\n",
"sorted_token_count.sort(key=lambda x:x[1], reverse=True)\n",
"print('Top 50 Word types in decreasing order of number of occurences:\\n')\n",
"sorted_token_count[:100]"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Text(0.5, 1.0, \"Zipf's Law\")"
]
},
"execution_count": 15,
"metadata": {},
"output_type": "execute_result"
},
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAZIAAAEWCAYAAABMoxE0AAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4xLjEsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy8QZhcZAAAgAElEQVR4nO3deZhcdZ3v8fcnTQeazWbpyUAHSMTcMCBCsGURFxYliCJ5GFBcxoA43OfKdZ8oudd5wqKCExVQR53IrgyrERHQGAGRywyBxLAFCGRQSJpAIiQgmsEs3/vH+RWp7nR3ne7T1dWn+/N6nnqqzq/OqfM9RZFv/85vU0RgZmY2UGMaHYCZmZWbE4mZmRXiRGJmZoU4kZiZWSFOJGZmVogTiZmZFeJEYjYAkn4haXrOfcdJ+q2kP0n6Zr1jMxtqWzU6ALPhRtJHgH/r4a3tgFkRcW5EvKcfH3kG8Edgx4gISWcDRMTZ/YwrgEkRsaw/x5nVm2skZt1ExNURsX31A/gs8DzwwwF85F7Ao+HRvzZCOZGY1SBpCnARcEpErExlv5H0ifT6VEn3SPqupJckPS7p6PTeFcB04IuSXpH0rm6fvaukWyStlfSipLsl9ev/S0l7S7pD0guS/ijpakmt6b3TJP28at8nJd1Qtb1c0oED+mLMEt/aMutD+gf5RuC8iPhNH7sekvbbFTgRmCtpYkScKglgRUR8Oe3766rjvgCsANrS9qFAf2suAs4HfgvsCPwEOJusFnUXcGFKTn8LjAUOS9f2emB74KF+ns+sC9dIzHqhLANcBTwC/EuN3VcBF0XE+oi4DlgKvDfHadYDuwF7pWPv7u8tsIhYFhHzI+LViFgNfAt4Z3rvKeBPwIHAO4B5wLOS9kn73B0Rm/pzPrPuXCMx692XgP2AN+f4x72z2z5PA7vnOMdsstrDr1LNZU5EXNCfICWNAy4G3g7sQPYH4pqqXe4CjgDekF6vJUsih6Vts0JcIzHrgaQjgP8LnBQRa3Mc0p5qMBV7As/WOigi/hQRX4iI1wPvBz5faV/ph6+R3Q7bPyJ2BD5KdruropJI3p5e30WWSN6JE4kNAicSs24k7QZcC3w2IhbnPOxvgE9LapZ0MvB3wG05zvU+SW9ISeglYCPQ162msZK2qXo0kdVCXgFektQOzOh2zF3AkUBLRKwA7gaOBXYB8l6fWa+cSMy29I/AOODi1NOq+vGDXo5ZAEwiGy/yVbKazAs5zjWJrPH9FeA/ge9FxJ197L8EWFf1OA04BziILBHdCsytPiAinkiff3fafhl4CrgnIjbmiNGsT3LXdrNiJJ0KfCIi3tboWMwawTUSMzMrxInEzMwK8a0tMzMrxDUSMzMrZNQNSNx1111jwoQJjQ7DzKxUFi1a9MeIaOvpvVGXSCZMmMDChQsbHYaZWalIerq393xry8zMCnEiMTOzQpxIzMysECcSMzMrxInEzMwKGXW9turppsWdzJ63lGfXrmP31hZmTJ3MtCntjQ7LzKyu6lojkXSZpFWSHqkq21nS/LR29HxJO6VySfq2pGWSHpJ0UNUx09P+T0qaXlX+ZkkPp2O+3W09iCF10+JOZs59mM616wigc+06Zs59mJsWdzYqJDOzIVHvW1tXkK17UO0s4PaImATcnrYB3kM2pfYk4Azg+5AlHmAW2ZrYBwOzKskn7fOPVcd1P9eQmT1vKevWd52Re936jcyet7RBEZmZDY26JpKI+C3wYrfiE4Ar0+srgWlV5VdF5l6gNS0wNBWYHxEvRsQaYD5wbHpvx4i4Ny1xelXVZw25Z9eu61e5mdlI0YjG9nERsTK9fo5sASGAdmB51X4rUllf5St6KN+CpDMkLZS0cPXq1cWvoAe7t7b0q9zMbKRoaK+tVJOo+/TDETEnIjoioqOtrcepYgqbMXUyLc1NXcpampuYMXVyXc5nZjZcNCKRPJ9uS1XWxl6Vyj
uBPar2G5/K+iof30N5Q0yb0s75J+5Pe2sLAtpbWzj/xP3da8vMRrxGdP+9GZgOXJCef1ZV/r8lXUvWsP5SRKyUNA/4WlUD+zHAzIh4UdLLkg4lWy/7Y8B3hvJCups2pd2Jw8xGnbomEknXAEcAu0paQdb76gLgekmnA08DH0i73wYcBywD/gKcBpASxnnA/Wm/cyOi0oD/SbKeYS3AL9LDzMyG0KhbIbGjoyM8jbyZWf9IWhQRHT295ylSzMysECcSMzMrxInEzMwKcSIxM7NCnEjMzKwQJxIzMyvEicTMzApxIjEzs0KcSMzMrBAnEjMzK8SJxMzMCnEiMTOzQpxIzMysECcSMzMrxInEzMwKcSIxM7NCnEjMzKwQJxIzMyvEicTMzApxIjEzs0KcSMzMrBAnEjMzK8SJxMzMCnEiMTOzQpxIzMysECcSMzMrxInEzMwKcSIxM7NCnEjMzKwQJxIzMyukYYlE0uckLZH0iKRrJG0jaaKkBZKWSbpO0ti079Zpe1l6f0LV58xM5UslTW3U9ZiZjVYNSSSS2oFPAx0R8UagCTgF+DpwYUS8AVgDnJ4OOR1Yk8ovTPshad903H7AscD3JDUN5bWYmY12jby1tRXQImkrYFtgJXAUcGN6/0pgWnp9QtomvX+0JKXyayPi1Yj4PbAMOHiI4jczMxqUSCKiE/gG8AxZAnkJWASsjYgNabcVQHt63Q4sT8duSPvvUl3ewzGvkXSGpIWSFq5evXrwL8jMbBRr1K2tnchqExOB3YHtyG5N1UVEzImIjojoaGtrq9dpzMxGpUbd2noX8PuIWB0R64G5wOFAa7rVBTAe6EyvO4E9ANL7rwNeqC7v4RgzMxsCjUokzwCHSto2tXUcDTwK3AmclPaZDvwsvb45bZPevyMiIpWfknp1TQQmAfcN0TWYmRlZg/eQi4gFkm4EfgdsABYDc4BbgWslfSWVXZoOuRT4kaRlwItkPbWIiCWSridLQhuAMyNi45BejJnZKKfsD/vRo6OjIxYuXNjoMMzMSkXSoojo6Ok9j2w3M7NCnEjMzKwQJxIzMyvEicTMzApxIjEzs0KcSMzMrBAnEjMzK8SJxMzMCnEiMTOzQpxIzMysECcSMzMrxInEzMwKcSIxM7NCnEjMzKyQXOuRSDocOBvYKx0jICLi9fULzczMyiDvwlaXAp8DFgFeOMrMzF6TN5G8FBG/qGskZmZWSnkTyZ2SZgNzgVcrhRHxu7pEZWZmpZE3kRySnquXWQzgqMENx8zMyiZXIomII+sdiJmZlVOu7r+Sxkm6VNIv0va+kk6vb2hmZlYGeceRXAHMA3ZP208An61HQGZmVi55E8muEXE9sAkgIjbgbsBmZkb+RPJnSbuQNbAj6VDgpbpFZWZmpZG319bngZuBvSXdA7QBJ9UtKjMzK428iWQx8E5gMtn0KEvxPF1mZkb+ZHBpRGyIiCUR8QgwFritjnGZmVlJ5E0knZK+ByBpJ2A+8OO6RWVmZqWRK5FExD8Dr0j6AfAr4JsRcXldIzMzs1Los41E0olVmwuAfwbuA0LSiRExt57BmZnZ8Fersf34btuLgeZUHmSTOJqZ2SjWZyKJiNPqdWJJrcAlwBvJktLHyXqDXQdMAP4AfCAi1kgScDFwHPAX4NTKzMOSpgNfTh/7lYi4sl4xm5nZlvLOtTVe0k8lrUqPn0gaX/DcFwO/jIh9gAOAx4CzgNsjYhJwe9oGeA8wKT3OAL6f4toZmEU2O/HBwKzUGcDMzIZI3l5bl5MNSNw9PX6eygZE0uuAd5CtvEhE/DUi1gInAJUaxZXAtPT6BOCqyNwLtEraDZgKzI+IFyNiDVlvsmMHGpeZmfVf3kTSFhGXp7EkGyLiCrLR7QM1EVgNXC5psaRLJG0HjIuIlWmf54Bx6XU7sLzq+BWprLfyLiSdIWmhpIWrV68uELaZmXWXN5G8IOmjkprS46PACwXOuxVwEPD9iJgC/JnNt7EAiIggze
1VVETMiYiOiOhoayuS/8zMrLu8ieTjwAfIagkryebZOrXAeVcAKyJiQdq+kSyxPJ9uWZGeV6X3O4E9qo4fn8p6KzczsyGSN5GMj4j3R0RbRPxNREwD9hzoSSPiOWC5pMmp6GjgUbJ2mOmpbDrws/T6ZuBjyhwKvJRugc0DjpG0U2pkPyaVmZnZEMk7aeN3yGoMtcr641PA1ZLGAk8Bp5EltuvT6otPk9WCIJvX6zhgGVn339MAIuJFSecB96f9zo2IFwvEZGZm/VRrZPthwFuBNkmfr3prR6CpyIkj4gGgo4e3ju5h3wDO7OVzLgMuKxKLmZkNXK0ayVhg+7TfDlXlL+P1SMzMjNoj2+8C7pJ0RUQ8PUQxmZlZieSd/ddJxMzMeuRVDs3MrJA+E4mkr6fnk4cmHDMzK5taNZLj0sy7M4ciGDMzK59avbZ+CawBtpf0MiCyaUtE1it3xzrHZ2Zmw1yfNZKImBERrcCtEbFjROxQ/TxEMZqZ2TCWa2R7RJwgaRzwllS0ICI8ja6ZmeVe2OpksrXaTyabtuQ+SR6QaGZmuefa+jLwlohYBSCpDfg12ay9ZmY2iuUdRzKmkkSSF/pxrJmZjWB5ayS/lDQPuCZtf5BsRl4zMxvl8ja2z5B0IvC2VDQnIn5av7DMzKws8tZIiIi5wNw6xmJmZiXkdg4zMyvEicTMzArJO47kzT2UvW/wwzEzs7LJWyP5oaQ3VjYkfQj45/qEZGZmZZK3sf0k4EZJHwbeDnwMOKZuUZmZWWnk7f77lKRTgJuAZ4BjImJdXSMzM7NS6DORSHqYbNr4ip2BJmCBJCLiTfUMzszMhr9aNRI3qJuZWZ/6TCQR8XTltaQmYFytY8zMbHTJlRQkfQqYBTwPbErFAfjWlpnZKJe3dvEZYHJEvFDPYMzMrHzyjiNZDrxUz0DMzKyc8tZIngJ+I+lW4NVKYUR8qy5RlcBNizuZPW8pz65dx+6tLcyYOplpU9obHZaZ2ZDLm0ieSY+x6TGq3bS4k5lzH2bd+o0AdK5dx8y5DwM4mZjZqJN3QOI59Q6kTGbPW/paEqlYt34js+ctdSIxs1En76SNbZJmS7pN0h2VR9GTS2qStFjSLWl7oqQFkpZJuk7S2FS+ddpelt6fUPUZM1P5UklTi8aUx7Nrex7U31u5mdlIlrex/WrgcWAicA7wB+D+QTj/Z4DHqra/DlwYEW8A1gCnp/LTgTWp/MK0H5L2BU4B9gOOBb6XxrvU1e6tLf0qNzMbyfImkl0i4lJgfUTcFREfB44qcmJJ44H3ApekbaXPvDHtciUwLb0+IW2T3j867X8CcG1EvBoRvweWAQcXiSuPGVMn09LcNV+1NDcxY+rkep/azGzYydvYvj49r5T0XuBZsnm3irgI+CKwQ9reBVgbERvS9gqg0uDQTtYFmYjYIOmltH87cG/VZ1Yf8xpJZwBnAOy5554Fw97coO5eW2Zm+RPJVyS9DvgC8B1gR+BzAz1pWhRrVUQsknTEQD8nr4iYA8wB6OjoiBq75zJtSrsTh5kZORJJanOYFBG3kA1KPHIQzns48H5JxwHbkCWmi4FWSVulWsl4oDPt3wnsAayQtBXwOuCFqvKK6mPMzGwI1GwjiYiNwIcG86QRMTMixkfEBLLG8jsi4iPAnWSLaAFMB36WXt+ctknv3xERkcpPSb26JgKTgPsGM1YzM+tb3ltb90j6LnAd8OdKYUT8bpDj+RJwraSvAIuBS1P5pcCPJC0DXiRLPkTEEknXA48CG4AzU+IzM7MhouwP+xo7SXf2UBwRUajnViN0dHTEwoULGx2GmVmpSFoUER09vZd3ZPtgtIuYmdkIlHcciZmZWY+cSMzMrBAnEjMzK6TfiUTSnHoEYmZm5TSQGkmPrfZmZjY65R1HUm3VoEdRQl4h0cws0+9EEhHH1iOQMrlpcSczbniQ9ZuyMTida9cx44YHgS1XSHTCMbORzo3tA3D2zUteSyIV6zcFZ9+8pE
tZZUnezrXrCDYvyXvTYk8HZmYjhxPJAKxdtz5XeV9L8pqZjRQ1E0laDnfAU8aPZl6S18xGg4bM/lt2O23bnKvcS/Ka2Wgw3Gb/LYVZx+/HjBsfZP3Gze0kzU3ivW/ajcMvuOO1hvUj92njJ4s6u9ze8pK8ZjbS5E0kB6bnc6vKgoLrtpdVT0vtHrlPG9fdt7xLT67r7lvOBw/egzsfX+1eW2Y2YuWaRn4kqdc08gee86seG+FbW5p5YNYxg34+M7OhNOBp5CXtmfMcayPi5X5HNoLk7cllZjbS1Lq1dSXZLSz1sU8AVwBXDVJMZmZWIn0mEi9old9O2zaz5i9b1j566+FlZjZSeEDiIJl1/H40N3WtuDU3iVnH79egiMzMhoYTyQDdtLiTwy+4g4ln3crhF9wBwOyTDqC9tQUB7a0tzD7pAPfQMrMRbyCz/456lTm0KuNDKnNonX/i/txz1qjsEW1mo5hrJAPgObTMzDZzIhkAz6FlZraZE8kAeA4tM7PNnEgG4Mh92vpVbmY2kjmRDMAtD67MXd69d5cXtTKzkca9tgYg73QovfXugi2X5DUzKyvXSOrIvbvMbDRwIhmAMb3MPNa93L27zGw0cCIZgE29zLzfvdy9u8xsNGhIIpG0h6Q7JT0qaYmkz6TynSXNl/Rket4plUvStyUtk/SQpIOqPmt62v9JSdOHIv72XhJB9/IZUyfT0tzUpcwrJJrZSNOoGskG4AsRsS9wKHCmpH2Bs4DbI2IScHvaBngPMCk9zgC+D1niAWYBhwAHA7MqyaeeZkydTHO3+1jNY7RFgpg2pZ3zT9y/y/xb55+4vxvazWxEaUivrYhYCaxMr/8k6TGgHTgBOCLtdiXwG+BLqfyqyJZzvFdSq6Td0r7zI+JFAEnzgWOBa+p+Ed3bSXppN5k2pd2Jw8xGtIa3kUiaAEwBFgDjUpIBeA4Yl163A8urDluRynorr6vZ85ayfmPXBpH1G8O9scxsVGroOBJJ2wM/AT4bES9Lm/+sj4iQNCgLyks6g+yWGHvumXf14N519tLrqnPtOg6/4A6eXbuO3VtbmDF1smsjZjbiNaxGIqmZLIlcHRFzU/Hz6ZYV6XlVKu8E9qg6fHwq6628i4iYExEdEdHR1lZ8GpO+1h3uXLuOSM8zbnhwwCPZPSLezMqiUb22BFwKPBYR36p662ag0vNqOvCzqvKPpd5bhwIvpVtg84BjJO2UGtmPSWV1lbeatH5TcPbNS/r9+ZUR8dVJaebch51MzGxYalSN5HDgH4CjJD2QHscBFwDvlvQk8K60DXAb8BSwDPgh8EmA1Mh+HnB/epxbaXgfLnqbTqUvHhFvZmXSqF5b/4/e7xAd3cP+AZzZy2ddBlw2eNHVttO2zaz5S/8TRF4eEW9mZdLwXltlNOv4/Whu6qulpBiPiDezMnEiyaF7wzfA7JMO6DLQcDB5RLyZlYmnka+ht6ngzz9xf+4566jX9pty7q96vN2107bN/T5npcvw7HlL3ZXYzIY9J5Ia+mr4rv6Hfdbx+zHjxge7DFRsbhKzjt9vQOf1iHgzKwvf2qqhtwbuyuDDvm53zT7pACcDMxvxXCOpYffWlj5HsleeZ9zwILNPPqDL7a7e3LS407etzGzEcCKp4ch92vjxvc/U3K8y+LB7QuieNI7cp42fLOr08rtmNmL41lYNdz6+Ove+va3ZXj1C/ep7n/FgQzMbUZxIaujttlYePTXU9za9igcbmllZ+dZWDU0SGyPf7FrNY+gy+29/ktAYiYln3eo2EzMrHddIasibRADWb+o6+29vY997Kt8Y4QkazayUnEhqKDJqPdgyabQ0N/GRQ/d8rZtwk7ZMK24zMbMy8a2tGvL22upNkCWj3rr6Tjzr1h6Pc5uJmZWFayQ19KfXVk+aa3zDnqDRzMrOiaSGojWD7u0m3VdN9ASNZlZ2TiQ1DHbNoPuqidOmtHP+ift3mVrl/BP3d68tMysNt5HUMGPqZGbc8C
DrN+XvvVVL94GLnqDRzMrMNZIcBjOJmJmNNK6R1DBz7kN1+dzqgYsegGhmZeZEUsO69Zvq8rnVMwd70kYzKzPf2hoGPADRzMrMiWSY8ABEMysrJ5JhwgMQzays3EZSQ3s/Z/EdiDGQawCiV1Y0s+HINZIaJuxS/5rCJmDh0y/2uU9Pi2R5lmAzGw5cI6nh3qfWDMl5fnzvM1x97zOv1TSALrWPP7+6odeVFV0rMbNGciKpoT/rkRT12nxcNz4IsXkgZF+31txIb2aN5kRSQ39WSBws6zfmP58b6c2s0ZxIajj09Ttxz3/13X7RSIPZhuPGfDMbCCeSGoZzEgH4j0GKr9KYX2mH8Yh7M8tLMcS3bRqto6MjFi5cmHv/Cb2sYDicjdthLM//6a9dtmcet2+X2saR+7Rx5+OruzTmd5+VGKC1pZnttt6qSy0FcM3FbJSRtCgiOnp8r+yJRNKxwMVAE3BJRFzQ1/6jIZGUxbgdxvLu/f6WaxYsZ2METRIfOmQPbl7cycuvbu6htuPWTew//nVdaoeH770zE9u23+LYjr12rmuS6+n2H5QnsZY9/kYYCbd8B+MaRmwikdQEPAG8G1gB3A98KCIe7e0YJ5KRbYygetb/luamQVsorPvtP4DmMQJ17SAxmOccTGWPvxF6+s7K9v0M1jX0lUjKPiDxYGBZRDwVEX8FrgVOaHBM1kDdl44ZzAkxZ89busVYnvWbYotedsN1Es6yx98IPX1nZft+huIayp5I2oHlVdsrUlkXks6QtFDSwtWrVw9ZcDY8DNZYm/58znAc31P2+Buht++hTN/PUFxD2RNJLhExJyI6IqKjra2tX8fuuHVTnaKyoTJYY2368znDcXxP2eNvhN6+hzJ9P0NxDWVPJJ3AHlXb41PZoHnonGOdTEpkjLputzQ35ZoQM48ZUyfT0tz1t9A8RjQ3dT3pYJ5zMJU9/kbo6Tsr2/czFNdQ9nEk9wOTJE0kSyCnAB8e7JM8dM6xW5S5Eb64svXaqnxOWXs9lT3+RujtOyvT9zMU11DqXlsAko4DLiLr/ntZRHy1r/3722vLzMz67rVV9hoJEXEbcFuj4zAzG63K3kZiZmYN5kRiZmaFOJGYmVkhTiRmZlZI6Xtt9Zek1cDTOXffFfhjHcOpN8ffWGWPH8p/DY5/8OwVET2O6B51iaQ/JC3srbtbGTj+xip7/FD+a3D8Q8O3tszMrBAnEjMzK8SJpG9zGh1AQY6/scoeP5T/Ghz/EHAbiZmZFeIaiZmZFeJEYmZmhTiR9EDSsZKWSlom6axGx5OHpMskrZL0SFXZzpLmS3oyPe/UyBj7ImkPSXdKelTSEkmfSeWluAZJ20i6T9KDKf5zUvlESQvSb+k6SWMbHWtfJDVJWizplrRdmvgl/UHSw5IekLQwlZXi91MhqVXSjZIel/SYpMPKcA1OJN1IagL+FXgPsC/wIUn7NjaqXK4Aui+cchZwe0RMAm5P28PVBuALEbEvcChwZvrey3INrwJHRcQBwIHAsZIOBb4OXBgRbwDWAKc3MMY8PgM8VrVdtviPjIgDq8ZelOX3U3Ex8MuI2Ac4gOy/xfC/hojwo+oBHAbMq9qeCcxsdFw5Y58APFK1vRTYLb3eDVja6Bj7cS0/A95dxmsAtgV+BxxCNip5q1Te5bc13B5kK4zeDhwF3AKoZPH/Adi1W1lpfj/A64DfkzpBlekaXCPZUjuwvGp7RSoro3ERsTK9fg4Y18hg8pI0AZgCLKBE15BuCz0ArALmA/8FrI2IDWmX4f5bugj4IrApbe9CueIP4FeSFkk6I5WV5vcDTARWA5en24uXSNqOElyDE8koEdmfM8O+r7ek7YGfAJ+NiJer3xvu1xARGyPiQLK/7A8G9mlwSLlJeh+wKiIWNTqWAt4WEQeR3ZY+U9I7qt8c7r8fsoUGDwK+HxFTgD/T7TbWcL0GJ5ItdQJ7VG2PT2Vl9Lyk3QDS86oGx9MnSc1kSeTqiJibik
t1DQARsRa4k+xWUKukykqkw/m3dDjwfkl/AK4lu711MeWJn4joTM+rgJ+SJfMy/X5WACsiYkHavpEssQz7a3Ai2dL9wKTUW2UscApwc4NjGqibgenp9XSydodhSZKAS4HHIuJbVW+V4hoktUlqTa9byNp3HiNLKCel3YZt/BExMyLGR8QEst/8HRHxEUoSv6TtJO1QeQ0cAzxCSX4/ABHxHLBc0uRUdDTwKCW4Bo9s74Gk48juFzcBl0XEVxscUk2SrgGOIJt2+nlgFnATcD2wJ9nU+R+IiBcbFWNfJL0NuBt4mM336P8PWTvJsL8GSW8CriT7zYwBro+IcyW9nuwv/J2BxcBHI+LVxkVam6QjgH+KiPeVJf4U50/T5lbAv0fEVyXtQgl+PxWSDgQuAcYCTwGnkX5PDONrcCIxM7NCfGvLzMwKcSIxM7NCnEjMzKwQJxIzMyvEicTMzApxIrHSkfTpNDPq1f045mxJnWlm2AckXVDPGIcLSW9PsxE/kMa3VMpbJX0yx/FHVGYCNuvNVrV3MRt2Pgm8KyJW9PO4CyPiG729KakpIjYWC23Y+QhwfkT8uFt5K9n3+L2hD8lGGtdIrFQk/QB4PfALSZ8bhM/7g6SvS/odcLKkvSX9Mk38d7ekfdJ+EyX9Z1rv4iuSXknlXf5il/RdSaem12+WdFf6rHlV01z8Jp3zPklPSHp7Km+S9A1Jj0h6SNKnJB0l6aaqz3+3pJ/SjaSj00R/Dytbm2ZrSZ8APgCc10Pt7QJg71RTma3M7HTuhyV9sIdzvCWdY+80kvyydA2LJZ2Q9jlV0tz0HT4p6V+K/Pexkmj09MN++NHfBz1MF57jmLPJ5ol6ID2mVn3WF6v2ux2YlF4fQjZVCGTTVHwsvT4TeCW9PgK4per47wKnAs3AfwBtqfyDZLMkAPwG+GZ6fRzw6/T6f5HNr1SZtn1nsqncH6/6nH8Hju92bduQzVj9P9L2VWSTXkK2Ts1JPXwfE+i65MDfk81Y3EQ2u+wzZFOWH0E2pfxbgUXAnmn/r5GNcoesdvMEsF269qfIpkTfhmwk9h6N/s34Ud+Hb23ZaNLbra3r4LWZh98K3JBN/QXA1un5cLJ/bAF+RLbgU18mA28E5lkIpHAAAAInSURBVKfPagJWVr1fmZRyEdk/6gDvAn4Qadr2SNNgSPoR8FFJl5NNBPmxHs71+4h4Im1fSZbsLqoRY7W3AddEdmvveUl3AW8BXgb+DpgDHBMRz6b9jyGb5PGf0vY2ZFN4QLYI00sp9keBvei6NIONME4kNmKkf2inAM9GxHH9OPTP6XkM2fobB/ayX0/zCW2g6y3ibSrhAEsi4rBePqsyX9VGav9/eDnwc+C/gRti8/ogQ2Ul2XVNASqJRMDfR8TS6h0lHcLma4N812cl5zYSGzEi4rTIllntTxKpPv5l4PeSToZsRmJJB6S37yGbFReyBuyKp4F9U5tEK9mMrZCtatcm6bD0Wc2S9qsRwnzgfypN2y5p5xTXs2T/gH+ZLKl0txSYIOkNafsfgLtqnOtPwA5V23cDH0ztNG3AO4D70ntrgfcC56cJHQHmAZ9Sqm5JmlLjfDaCOZGYdfUR4HRJDwJLgBNS+WfIFkt6mKpVAiNiOdnMrI+k58Wp/K9k069/PX3WA2S3zfpyCVnbxEPpmA9XvXc1sDwiHut+UET8N9kssTek+DYBP+jrRBHxAnBPalyfTTZz7kPAg8AdZO1Gz1Xt/zzwPuBfU63jPLJ2oIckLUnbNkp59l+zAZD0SkRsP4Tn+y6wOCIuHapzmuXlRGI2AEOZSCQtImvHeXcMw7VAzJxIzMysELeRmJlZIU4kZmZWiBOJmZkV4kRiZmaFOJGYmVkh/x9AZvdC6btnfwAAAABJRU5ErkJggg==\n",
"text/plain": [
"<Figure size 432x288 with 1 Axes>"
]
},
"metadata": {
"needs_background": "light"
},
"output_type": "display_data"
}
],
"source": [
"length = [len(item[0]) for item in sorted_token_count]\n",
"frequency = [item[1] for item in sorted_token_count]\n",
"plt.scatter(length, frequency)\n",
"\n",
"plt.xlabel('f - Frequency of token')\n",
"plt.ylabel('|r - rank of token')\n",
"plt.title('Zipf\\'s Law')"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [],
"source": [
"from random import choice\n",
"from nltk.corpus import wordnet"
]
},
{
"cell_type": "code",
"execution_count": 28,
"metadata": {},
"outputs": [],
"source": [
"def num_meanings(word):\n",
" synonyms = []\n",
" antonyms = []\n",
"\n",
" for syn in wordnet.synsets(word):\n",
" for l in syn.lemmas():\n",
" synonyms.append(l.name())\n",
" if l.antonyms():\n",
" antonyms.append(l.antonyms()[0].name())\n",
"\n",
" return len(list(set(synonyms)))"
]
},
{
"cell_type": "code",
"execution_count": 80,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[('twitter', 1760),\n",
" ('hi', 1801),\n",
" ('he', 557),\n",
" ('at', 566),\n",
" ('sir', 691),\n",
" ('me', 1955),\n",
" ('log', 448),\n",
" ('of', 1074),\n",
" ('india', 388),\n",
" ('time', 385)]"
]
},
"execution_count": 80,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"choose = sorted_token_count[:100]\n",
"l1 = ['twitter', 'hi', 'he', 'at', 'sir', 'me','log', 'of', 'india','time']\n",
"lst = []\n",
"for k in l1:\n",
" item = (k, token_count[k])\n",
" lst.append(item)\n",
"# lst.sort(key=lambda x:x[1], reverse=True)\n",
"lst"
]
},
{
"cell_type": "code",
"execution_count": 26,
"metadata": {},
"outputs": [],
"source": [
"num_meanings('hello')"
]
},
{
"cell_type": "code",
"execution_count": 81,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[3, 9, 4, 4, 2, 3, 3, 0, 3, 9]"
]
},
"execution_count": 81,
"metadata": {},
"output_type": "execute_result"
},
{
"data": {
"image/png": "\n",
"text/plain": [
"<Figure size 432x288 with 1 Axes>"
]
},
"metadata": {
"needs_background": "light"
},
"output_type": "display_data"
}
],
"source": [
"meanings = {'to':4,'the':4 ,'raha':2, 'diya':3, 'my':2, 'for':3, 'com':3, 'do':2, 'india':1,'time':1}\n",
"# m = [meanings[lst[i][0]] for i in range(10)]\n",
"m = [num_meanings(lst[i][0]) for i in range(10)]\n",
"f = [lst[i][1] for i in range(10)] \n",
"plt.scatter(m, f)\n",
"plt.xlabel('m - Number of meanings')\n",
"plt.ylabel('f - Frequency of the token')\n",
"plt.title('Zipf\\'s Law')\n",
"m\n",
"# lst"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# 3. Heaps' Law"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [],
"source": [
"vsize = 0\n",
"num_tokens = 0\n",
"unique_tokens = []\n",
"V = []\n",
"N = []\n",
"\n",
"for i in range(len(tokens)):\n",
" s = tokens[i]\n",
" if s not in unique_tokens:\n",
" unique_tokens.append(s)\n",
" vsize += 1\n",
" V.append(vsize)\n",
" N.append(i+1)"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [],
"source": [
"N = np.array(N)\n",
"V = np.array(V)"
]
},
{
"cell_type": "code",
"execution_count": 29,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Text(0.5, 1.0, \"Heaps' Law\")"
]
},
"execution_count": 29,
"metadata": {},
"output_type": "execute_result"
},
{
"data": {
"image/png": "\n",
"text/plain": [
"<Figure size 432x288 with 1 Axes>"
]
},
"metadata": {
"needs_background": "light"
},
"output_type": "display_data"
}
],
"source": [
"plt.plot(N, V)\n",
"plt.xlabel('N - Number of tokens')\n",
"plt.ylabel('|V| - Size of vocabulary')\n",
"plt.title('Heaps\\' Law')"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"1000 610\n",
"10000 3597\n",
"20000 5920\n",
"30000 7753\n",
"50000 10806\n",
"100000 16675\n"
]
}
],
"source": [
"for i in [1000, 10000, 20000, 30000, 50000, 100000]:\n",
" print(i, V[i-1])"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {},
"outputs": [],
"source": [
"from math import log"
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"1.3188109468386284\n",
"0.6824809994294719\n",
"1.3976147883124628\n",
"0.655343979207257\n",
"1.5589214096765935\n",
"0.6405478197637083\n"
]
}
],
"source": [
"print(8385/6358)\n",
"print(log(1.3188, 3/2))\n",
"print(11719/8385)\n",
"print(log(1.397614, 5/3))\n",
"print(18269/11719)\n",
"print(log(1.558921, 10/5))"
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"10.340940789558791"
]
},
"execution_count": 23,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"11719 / 50000 ** 0.65"
]
},
{
"cell_type": "code",
"execution_count": 28,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Text(0.5, 1.0, \"Heaps' Law\")"
]
},
"execution_count": 28,
"metadata": {},
"output_type": "execute_result"
},
{
"data": {
"image/png": "\n",
"text/plain": [
"<Figure size 432x288 with 1 Axes>"
]
},
"metadata": {
"needs_background": "light"
},
"output_type": "display_data"
}
],
"source": [
"plt.plot(N, V)\n",
"\n",
"k = 10.34\n",
"beta = 0.64\n",
"plt.plot(N, k * (N**beta))\n",
"plt.xlabel('N - Number of tokens')\n",
"plt.ylabel('|V| - Size of vocabulary')\n",
"plt.title('Heaps\\' Law')"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.8"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
"import pandas as pd\n",
"import re\n",
"import matplotlib.pyplot as plt\n",
"import numpy as np\n",
"from bs4 import BeautifulSoup"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"data = pd.read_csv('tweets-dataset.csv')"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Sentence</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <t{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# ASSIGNMENT 1\n",
"NLP\n",
"Akhilesh Ravi\n",
"16110007"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"import re\n",
"import matplotlib.pyplot as plt\n",
"import numpy as np\n",
"from bs4 import BeautifulSoup"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"data = pd.read_csv('tweets-dataset.csv')"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Sentence</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>@BubblyDentist @MeetUunngLee nahi nahi, mere s...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>going to the grammys first entertainment law b...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>bohot hi badiya ji aap sunao?</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>Parvez Musharraf is Digvijay Singh of Pakistan...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>guddu ko bass john cena k sticker ki padii hai...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>Aman ki maa ki... Asha https://twitter.com/ash...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>pakistan can wait more more and more . . . ...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7</th>\n",
" <td>@sagarcasm Jai Mahesh !!</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8</th>\n",
" <td>RT AAP_Najafgarh \" RT AAPInNews : When DDCA lo...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9</th>\n",
" <td>\"Kaam ho jayega, thoda kharcha paani lagega\" \\...</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Sentence\n",
"0 @BubblyDentist @MeetUunngLee nahi nahi, mere s...\n",
"1 going to the grammys first entertainment law b...\n",
"2 bohot hi badiya ji aap sunao?\n",
"3 Parvez Musharraf is Digvijay Singh of Pakistan...\n",
"4 guddu ko bass john cena k sticker ki padii hai...\n",
"5 Aman ki maa ki... Asha https://twitter.com/ash...\n",
"6 pakistan can wait more more and more . . . ...\n",
"7 @sagarcasm Jai Mahesh !!\n",
"8 RT AAP_Najafgarh \" RT AAPInNews : When DDCA lo...\n",
"9 \"Kaam ho jayega, thoda kharcha paani lagega\" \\..."
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"data[:10]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# s = re.sub('[^0-9a-zA-Z]+', '*', s)\n",
"# https://stackoverflow.com/questions/12985456/replace-all-non-alphanumeric-characters-in-a-string"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [],
"source": [
"def cleanText(raw_text):\n",
" '''\n",
" Convert a raw review to a cleaned review\n",
" '''\n",
" text = BeautifulSoup(raw_text, 'html').get_text() #remove html\n",
" words = text.split()\n",
" words = [w for w in words if '@' not in w and '#' not in w] # remove the @-words and #-words\n",
" text = ' '.join(words)\n",
" letters_only = re.sub('[^a-zA-Z]+', ' ', text) # remove non-character\n",
" \n",
" return( letters_only.lower())\n",
"\n",
"vclean = np.vectorize(cleanText)"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Sentence</th>\n",
" <th>Cleaned sentence</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>@BubblyDentist @MeetUunngLee nahi nahi, mere s...</td>\n",
" <td>nahi nahi mere saath jaakar pachtaogi ye uunng...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>going to the grammys first entertainment law b...</td>\n",
" <td>going to the grammys first entertainment law b...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>bohot hi badiya ji aap sunao?</td>\n",
" <td>bohot hi badiya ji aap sunao</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>Parvez Musharraf is Digvijay Singh of Pakistan...</td>\n",
" <td>parvez musharraf is digvijay singh of pakistan...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>guddu ko bass john cena k sticker ki padii hai...</td>\n",
" <td>guddu ko bass john cena k sticker ki padii hai...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>Aman ki maa ki... Asha https://twitter.com/ash...</td>\n",
" <td>aman ki maa ki asha https twitter com ashabhos...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>pakistan can wait more more and more . . . ...</td>\n",
" <td>pakistan can wait more more and more aakhir pa...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7</th>\n",
" <td>@sagarcasm Jai Mahesh !!</td>\n",
" <td>jai mahesh</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8</th>\n",
" <td>RT AAP_Najafgarh \" RT AAPInNews : When DDCA lo...</td>\n",
" <td>rt aap najafgarh rt aapinnews when ddca lowere...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9</th>\n",
" <td>\"Kaam ho jayega, thoda kharcha paani lagega\" \\...</td>\n",
" <td>kaam ho jayega thoda kharcha paani lagega sir...</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Sentence \\\n",
"0 @BubblyDentist @MeetUunngLee nahi nahi, mere s... \n",
"1 going to the grammys first entertainment law b... \n",
"2 bohot hi badiya ji aap sunao? \n",
"3 Parvez Musharraf is Digvijay Singh of Pakistan... \n",
"4 guddu ko bass john cena k sticker ki padii hai... \n",
"5 Aman ki maa ki... Asha https://twitter.com/ash... \n",
"6 pakistan can wait more more and more . . . ... \n",
"7 @sagarcasm Jai Mahesh !! \n",
"8 RT AAP_Najafgarh \" RT AAPInNews : When DDCA lo... \n",
"9 \"Kaam ho jayega, thoda kharcha paani lagega\" \\... \n",
"\n",
" Cleaned sentence \n",
"0 nahi nahi mere saath jaakar pachtaogi ye uunng... \n",
"1 going to the grammys first entertainment law b... \n",
"2 bohot hi badiya ji aap sunao \n",
"3 parvez musharraf is digvijay singh of pakistan... \n",
"4 guddu ko bass john cena k sticker ki padii hai... \n",
"5 aman ki maa ki asha https twitter com ashabhos... \n",
"6 pakistan can wait more more and more aakhir pa... \n",
"7 jai mahesh \n",
"8 rt aap najafgarh rt aapinnews when ddca lowere... \n",
"9 kaam ho jayega thoda kharcha paani lagega sir... "
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"data['Cleaned sentence'] = vclean(data['Sentence'])\n",
"data[:10]"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0 nahi nahi mere saath jaakar pachtaogi ye uunng...\n",
"1 going to the grammys first entertainment law b...\n",
"2 bohot hi badiya ji aap sunao \n",
"3 parvez musharraf is digvijay singh of pakistan...\n",
"4 guddu ko bass john cena k sticker ki padii hai...\n",
"5 aman ki maa ki asha https twitter com ashabhos...\n",
"6 pakistan can wait more more and more aakhir pa...\n",
"7 jai mahesh \n",
"8 rt aap najafgarh rt aapinnews when ddca lowere...\n",
"9 kaam ho jayega thoda kharcha paani lagega sir...\n",
"Name: Cleaned sentence, dtype: object"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"data[:10]['Cleaned sentence']"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [],
"source": [
"def tokenize(s):\n",
" return tuple(s.split())"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Number of tokens: 303161\n",
"Number of word types: 32707\n"
]
}
],
"source": [
"tokens = []\n",
"for i in range(len(data)):\n",
" tokens.extend( tokenize( data.iloc[i]['Cleaned sentence']) )\n",
"sorted_tokens = sorted(tokens)\n",
"word_types = list(set(tokens))\n",
"print('Number of tokens:', len(tokens))\n",
"print('Number of word types:', len(word_types))"
]
},
{
"cell_type": "code",
"execution_count": 26,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"TTR: 0.10788656852299604\n"
]
}
],
"source": [
"print('TTR:', len(word_types)/len(tokens))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# 2. Zipf's Law"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [],
"source": [
"token_count = {}\n",
"for s in sorted_tokens:\n",
" if s in token_count:\n",
" token_count[s] += 1\n",
" else:\n",
" token_count[s] = 1"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Top 50 Word types in decreasing order of number of occurences:\n",
"\n"
]
},
{
"data": {
"text/plain": [
"[('hai', 10030),\n",
" ('to', 4154),\n",
" ('ki', 3224),\n",
" ('ke', 3170),\n",
" ('nahi', 3169),\n",
" ('bhi', 2929),\n",
" ('the', 2866),\n",
" ('se', 2601),\n",
" ('ho', 2365),\n",
" ('ka', 2310),\n",
" ('bhai', 2266),\n",
" ('ko', 2208),\n",
" ('me', 1955),\n",
" ('ye', 1869),\n",
" ('kya', 1815),\n",
" ('hi', 1801),\n",
" ('aur', 1797),\n",
" ('twitter', 1760),\n",
" ('com', 1724),\n",
" ('kar', 1681),\n",
" ('i', 1509),\n",
" ('in', 1387),\n",
" ('t', 1319),\n",
" ('https', 1310),\n",
" ('is', 1296),\n",
" ('mein', 1276),\n",
" ('a', 1202),\n",
" ('ek', 1165),\n",
" ('and', 1126),\n",
" ('status', 1108),\n",
" ('of', 1074),\n",
" ('on', 1071),\n",
" ('na', 1026),\n",
" ('s', 1009),\n",
" ('ab', 969),\n",
" ('toh', 963),\n",
" ('rt', 944),\n",
" ('tha', 937),\n",
" ('http', 905),\n",
" ('for', 885),\n",
" ('you', 885),\n",
" ('aaj', 873),\n",
" ('co', 872),\n",
" ('raha', 868),\n",
" ('par', 826),\n",
" ('ne', 824),\n",
" ('aap', 820),\n",
" ('hain', 816),\n",
" ('koi', 802),\n",
" ('kuch', 801)]"
]
},
"execution_count": 15,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"sorted_token_count = list(token_count.items())\n",
"sorted_token_count.sort(key=lambda x:x[1], reverse=True)\n",
"print('Top 50 Word types in decreasing order of number of occurences:\\n')\n",
"sorted_token_count[:50]"
]
},
{
"cell_type": "code",
"execution_count": 40,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Text(0.5, 1.0, \"Zipf's Law\")"
]
},
"execution_count": 40,
"metadata": {},
"output_type": "execute_result"
},
{
"data": {
"image/png": "\n",
"text/plain": [
"<Figure size 432x288 with 1 Axes>"
]
},
"metadata": {
"needs_background": "light"
},
"output_type": "display_data"
}
],
"source": [
"length = [len(item[0]) for item in sorted_token_count]\n",
"frequency = [item[1] for item in sorted_token_count]\n",
"plt.scatter(length, frequency)\n",
"\n",
"plt.xlabel('f - Frequency of token')\n",
"plt.ylabel('|r - rank of token')\n",
"plt.title('Zipf\\'s Law')"
]
},
{
"cell_type": "code",
"execution_count": 31,
"metadata": {},
"outputs": [],
"source": [
"from random import choice"
]
},
{
"cell_type": "code",
"execution_count": 68,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[('to', 4154),\n",
" ('the', 2866),\n",
" ('com', 1724),\n",
" ('for', 885),\n",
" ('raha', 868),\n",
" ('do', 684),\n",
" ('diya', 452),\n",
" ('my', 424),\n",
" ('india', 388),\n",
" ('time', 385)]"
]
},
"execution_count": 68,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"choose = sorted_token_count[:100]\n",
"l1 = ['to', 'the', 'raha', 'diya', 'my', 'for','com', 'do', 'india','time']\n",
"lst = []\n",
"for k in l1:\n",
" item = (k, token_count[k])\n",
" lst.append(item)\n",
"lst.sort(key=lambda x:x[1], reverse=True)\n",
"lst\n"
]
},
{
"cell_type": "code",
"execution_count": 72,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Text(0.5, 1.0, \"Zipf's Law\")"
]
},
"execution_count": 72,
"metadata": {},
"output_type": "execute_result"
},
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAYsAAAEWCAYAAACXGLsWAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4xLjEsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy8QZhcZAAAgAElEQVR4nO3de5hdVX3/8feHIci0XAbMmOamgZgGoWISj1yKrQhqAC1JKVWUaqBorOJPqTZCLE8Br9iUi9QCBqFcRCGFGCKCMUJEpAJOSExIMDLl0mRAEggJoPlhEr79Y6+Bk2HmnD2TOXP2zPm8nmc/s/fat+/mkPM9e62111ZEYGZmVsku9Q7AzMyKz8nCzMyqcrIwM7OqnCzMzKwqJwszM6vKycLMzKpysjDrgaTbJc3Iue0IST+T9LykC2odm9lA27XeAZjVg6STgW91s+qPgXMi4osRcWwvDjkTeBrYKyJC0rkAEXFuL+MKYEJEtPdmP7Na852FNaSIuD4i9iifgDOAp4Ar+nDINwCrw0+52hDlZGEGSJoMXAycFBFPprKfSvpomj9F0j2Svilps6RfSzo6rbsamAF8XtILkt7V5djDJd0qaZOkjZLultSrf3uSxku6U9Izkp6WdL2klrTuVEk/KNv2YUn/Vba8VtKkPv2HMUtcDWUNL33p3gR8KSJ+WmHTQ9N2w4ETgPmS9ouIUyQBrIuIs9O2Pynb73PAOqA1LR8G9PYORMDXgJ8BewE3A+eS3Q3dBVyUEtCfALsBh6dr2x/YA1jRy/OZ7cB3FtbQlH3LXws8CPxrlc3XAxdHxNaIuBFYA7w3x2m2AiOBN6R97+5tdVVEtEfE4oh4MSI2ABcC70jrHgGeByYBfwksAp6QdEDa5u6IeKk35zPryncW1ujOBA4C3prjC7yjyzaPA6NynGMO2V3Aj9MdyNyIOL83QUoaAXwD+AtgT7Ifes+WbXIXcCTwxjS/iSxRHJ6WzXaK7yysYUk6Evhn4MSI2JRjl9HpTqTT64Enqu0UEc9HxOciYn/geOCzne0dvfBVsqqrN0fEXsDfkVVNdepMFn+R5u8iSxbvwMnC+oGThTUkSSOBG4AzImJZzt1eB3xa0jBJfwu8Cbgtx7neJ+mNKdFsBrYDlaqFdpO0e9nURHY38QKwWdJoYFaXfe4C3gk0R8Q64G7gGOC1QN7rM+uRk4U1qo8BI4BvpB5M5dPlPexzHzCB7HmKr5DdkTyT41wTyBq8XwB+AVwaEUsqbL8K2FI2nQqcB0whSzY/BOaX7xARv0nHvzstPwc8AtwTEdtzxGhWkdwt3Kw6SacAH42It9c7FrN68J2FmZlV5WRhZmZVuRrKzMyq8p2FmZlVNSQfyhs+fHiMGzeu3mGYmQ0qS5cufToiWrtbNySTxbhx42hra6t3GGZmg4qkx3ta52ooMzOrysnCzMyqcrIwM7OqnCzMzKwqJwszM6tqSPaGMjNrJAuWdTBn0Rqe2LSFUS3NzJo6kemTR/frOZwszMwGsQXLOpg9fyVbtmaDC3ds2sLs+SsB+jVhuBrKzGwQm7NozcuJotOWrduZs2hNv57HycLMbBB7YtOWXpX3lZOFmdkgNqqluVflfeVkYWY2iM2aOpHmYU07lDUPa2LW1In9eh43cJuZDWKdjdjuDWVmZhVNnzy635NDVzWvhpLUJGmZpFvT8n6S7pPULulGSbul8tek5fa0flzZMWan8jWSptY6ZjMz29FAtFl8BniobPnrwEUR8UbgWeC0VH4a8Gwqvyhth6QDgZOAg4BjgEsl7VhBZ2ZmNVXTZCFpDPBe4NtpWcBRwE1pk2uA6Wl+WlomrT86bT8NuCEiXoyIR4F24JBaxm1mZjuq9Z3FxcDngZfS8muBTRGxLS2vAzor2kYDawHS+s1p+5fLu9nnZZJmSmqT1LZhw4b+vg4zs4ZWs2Qh6X3A+ohYWqtzlIuIuRFRio
hSa2u3bwU0M7M+qmVvqCOA4yUdB+wO7AV8A2iRtGu6exgDdKTtO4CxwDpJuwJ7A8+UlXcq38fMzAZAze4sImJ2RIyJiHFkDdR3RsTJwBLgxLTZDOCWNL8wLZPW3xkRkcpPSr2l9gMmAPfXKm4zM3u1ejxncSZwg6QvA8uAK1P5lcB1ktqBjWQJhohYJWkesBrYBpweEdtffVgzM6sVZT/eh5ZSqRRtbW31DsPMbFCRtDQiSt2t89hQZmZWlZOFmZlV5WRhZmZVOVmYmVlVThZmZlaVk4WZmVXlZGFmZlU5WZiZWVVOFmZmVpWThZmZVeVkYWZmVTlZmJlZVU4WZmZWlZOFmZlV5WRhZmZVOVmYmVlVThZmZlZVzZKFpN0l3S/pV5JWSTovlV8t6VFJy9M0KZVL0iWS2iWtkDSl7FgzJD2cphk9ndPMzGqjlu/gfhE4KiJekDQM+Lmk29O6WRFxU5ftjwUmpOlQ4DLgUEn7AucAJSCApZIWRsSzNYzdzMzK1OzOIjIvpMVhaar0wu9pwLVpv3uBFkkjganA4ojYmBLEYuCYWsVtZmavVtM2C0lNkpYD68m+8O9Lq76SqpoukvSaVDYaWFu2+7pU1lO5mZkNkJomi4jYHhGTgDHAIZL+DJgNHAC8DdgXOLM/ziVppqQ2SW0bNmzoj0OamVkyIL2hImITsAQ4JiKeTFVNLwL/CRySNusAxpbtNiaV9VTe9RxzI6IUEaXW1tZaXIaZWcOqmiwk/amkKyT9WNKdnVOO/VoltaT5ZuDdwK9TOwSSBEwHHky7LAQ+knpFHQZsjogngUXAeyTtI2kf4D2pzMzMBkie3lD/BVwOXAFs78WxRwLXSGoiS0rzIuLWlGxaAQHLgX9I298GHAe0A78HTgWIiI2SvgT8Mm33xYjY2Is4zMxsJymiUgclkLQ0It46QPH0i1KpFG1tbfUOw8xsUEnf96Xu1uVps/iBpE9KGilp386pn2M0M7MCy1MN1fnE9KyysgD27/9wzMysiKomi4jYbyACMTOz4srTG+qPJJ0taW5aniDpfbUPzczMiiJPm8V/An8A/jwtdwBfrllEZmZWOHmSxfiI+FdgK0BE/J6s26uZmTWIPMniD+mhugCQNJ5sRFkzM2sQeXpDnQP8CBgr6XrgCOCUWgZlZmbFkqc31GJJDwCHkVU/fSYinq55ZGZmVhh5ekN9MSKeiYgfRsStwMZ0h2FmZg0iT5vFWEmzAdK7J74PPFzTqMzMrFDyJIu/B96cEsYPgJ9GxLk1jcrMzAqlxzYLSVPKFr8BfAu4B7hL0pSIeKDWwZmZWTFUauC+oMvys8CBqTyAo2oVlJmZFUuPySIi3jmQgZiZWXHl6Q21t6QLO99vLekCSXsPRHBmZlYMeRq4rwKeB96fpufIxosyM7MGkecJ7vER8Tdly+dJWl6rgMzMrHjy3FlskfT2zgVJRwBbqu0kaXdJ90v6laRVks5L5ftJuk9Su6QbJe2Wyl+TltvT+nFlx5qdytdImtrbizQzs52TJ1n8A/Afkh6T9BjwTeDjOfZ7ETgqIt4CTAKOkXQY8HXgooh4I1kPq9PS9qcBz6byi9J2SDoQOAk4CDgGuFRSU87rMzOzfpAnWTyXvvAPBg6OiMlkbRgVReaFtDgsTZ1dbm9K5dcA09P8tLRMWn+0JKXyGyLixYh4FGgHDskRt5mZ9ZM8yeJmgIh4LiKeS2U3Vdj+ZZKaUvvGemAx8D/ApojYljZZB4xO86OBtelc24DNwGvLy7vZp/xcMzt7bG3YsCFPeGZmllOlJ7gPIKv62VvSCWWr9gJ2z3PwiNgOTJLUQjam1AE7EWu1c80F5gKUSqWo1XnMzBpRpd5QE4H3AS3AX5WVPw98rDcniYhNkpYAhwMtknZNdw9jyF7TSvo7FlgnaVdgb+CZsvJO5fuYmdkAqPQE9y3ALZIOj4hf9PbAklqBrSlRNAPvJmu0XgKcCNwAzABuSbssTMu/SOvvjI
iQtBD4rqQLgVHABOD+3sZjZmZ9l+flR71OFMlI4JrUc2kXYF5E3CppNXCDpC8Dy4Ar0/ZXAtdJagc2kvWAIiJWSZoHrAa2Aaen6i0zMwMWLOtgzqI1PLFpC6Nampk1dSLTJ7+qaXenKGLoVe+XSqVoa2urdxhmZjW3YFkHs+evZMvWV35DNw9r4msnvLnXCUPS0ogodbcuT28oMzMrqDmL1uyQKAC2bN3OnEVr+vU8eQYSHCHpSkm3p+UDJZ1WbT8zM6u9JzZ1P6BGT+V9lefO4mpgEVnjMsBvgDP6NQozM+uTUS3NvSrvqzzJYnhEzANegpcfmHMDs5lZAcyaOpHmYTuOgNQ8rIlZUyf263nyjDr7O0mvJRuqgzS+0+Z+jcLMzPqksxG71r2h8iSLz5I9AzFe0j1AK9lzEGZmVgDTJ4/u9+TQVZ7nLB6Q9A6yJ7oFrImIrTWNyszMCiXPnQVko7yOS9tPkUREXFuzqMzMrFCqJgtJ1wHjgeW80rAdgJOFmVmDyHNnUQIOjKH4qLeZmeWSp+vsg8Cf1DoQMzMrrkrvs/gBWXXTnsBqSfeTvSoVgIg4vvbhmZlZEVSqhvq3AYvCzMwKrdL7LO4CkPT1iDizfJ2krwN31Tg2MzMriDxtFu/upuzY/g7EzMyKq1KbxSeATwL7S1pRtmpP4J5aB2ZmZsVRqc3iu8DtwNeAs8rKn4+IjTWNyszMCqXHaqiI2BwRj0XEByPi8bIpV6KQNFbSEkmrJa2S9JlUfq6kDknL03Rc2T6zJbVLWiNpaln5MamsXdJZ3Z3PzMxqJ+9wH32xDfhcGltqT2CppMVp3UURsUNvK0kHkr13+yCyd2f8RNKfptX/QdZ2sg74paSFEbG6hrGbmVmZSm0Wr4mIF3taX01EPAk8meafl/QQUGlYxGnADemcj0pqJxuTCqA9Ih5Jcd2QtnWyMDMbIJV6Q/0CXh4baqdIGgdMBu5LRZ+StELSVZL2SWWjgbVlu61LZT2Vdz3HTEltkto2bNiwsyGbmVmZSsliN0kfAv5c0gldp7wnkLQHcDNwRkQ8B1xGNjDhJLI7jwt2Iv6XRcTciChFRKm1tbU/DmlmZkmlNot/AE4GWoC/6rIugPnVDi5pGFmiuD4i5gNExFNl668Abk2LHcDYst3HpDIqlJuZ2QCo9AT3z4GfS2qLiCt7e2BJAq4EHoqIC8vKR6b2DIC/JhuoELK38X1X0oVkDdwTgPvJXrg0QdJ+ZEniJOBDvY3HzMz6Lk9vqOskfRr4y7R8F3B5jrflHQF8GFgpaXkq+wLwQUmTyO5OHgM+DhARqyTNI2u43gacHhHbASR9ClgENAFXRcSqnNdnZmb9QNVeUyHp28Aw4JpU9GFge0R8tMax9VmpVIq2trZ6h2FmNqhIWhoRpe7W5bmzeFtEvKVs+U5Jv+qf0MzMbDDIM5DgdknjOxck7c8rr1c1M7MGkOfOYhawRNIjZI3NbwBOrWlUZmZWKFWTRUTcIWkCMDEVrdmZJ7vNzGzwyTU2VEoOK6puaGZmQ1KeNgszM2twThZmZlZV1WQhab6k90pyYjEza1B5EsClZMNrPCzpfEkTq+1gZmZDS9VkERE/iYiTgSlkw3P8RNJ/Szo1DRRoZmZDXK6qJUmvBU4BPgosA75BljwWV9jNzMyGiKpdZyV9n+wZi+uAvyobMfZGSR6AycysAeR5zuKSiFjS3YqeBpwyM7OhJU811IGSWjoXJO0j6ZM1jMnMzAomT7L4WERs6lyIiGeBj9UuJDMzK5o8yaIpvfUOAElNwG61C8nMzIomT5vFj8gas7+Vlj+eyszMrEHkubM4E1gCfCJNdwCfr7aTpLGSlkhaLWmVpM+k8n0lLZb0cPq7TyqXpEsktUtaIWlK2bFmpO0fljSjLxdqZmZ9l2eI8peAy9LUG9uAz0XEA5L2BJZKWkz2vMYdEXG+pLOAs8gS0rHAhDQdms53qKR9gXOAEtl7u5dKWpjaTszMbA
DkGRvqiHQH8BtJj0h6NL0IqaKIeDIiHkjzzwMPAaOBabzyPu9rgOlpfhpwbWTuBVokjQSmAosjYmNKEIuBY3p5nWZmthPytFlcCfwjsJQ+vk5V0jhgMnAfMKLswb7fAiPS/Ghgbdlu61JZT+VdzzETmAnw+te/vi9hmplZD/K0WWyOiNsjYn1EPNM55T2BpD2Am4EzIuK58nUREWRVSzstIuZGRCkiSq2trf1xSDMzS/IkiyWS5kg6XNKUzinPwdNAgzcD10fE/FT8VKpeIv1dn8o7gLFlu49JZT2Vm5nZAMlTDXVo+ls+tEcAR1XaKT2bcSXwUERcWLZqITADOD/9vaWs/FOSbkjn3BwRT0paBHy1s9cU8B5gdo64zcysn+TpDfXOPh77CODDwEpJy1PZF8iSxDxJpwGPA+9P624DjgPagd8Dp6bzb5T0JeCXabsvRsTGPsZkZmZ9oKzZoMIG0gjgq8CoiDhW0oHA4RFx5UAE2BelUina2jwgrplZb0ha2tMAsXnaLK4GFgGj0vJvgDP6JzQzMxsM8iSL4RExD3gJICK20ccutGZmNjjlSRa/S2/KCwBJhwGbaxqVmZkVSp7eUJ8l66k0XtI9QCtwYk2jMjOzQsnTG+oBSe8ge7WqgDURsbXmkZmZWWHkeQf3R7oUTZFERFxbo5jMzKxg8lRDva1sfnfgaOABwMnCrAEtWNbBnEVreGLTFka1NDNr6kSmT37VcG02xOSphvp/5cvpfdw31CwiMyusBcs6mD1/JVu2Zh0iOzZtYfb8lQBOGENcnt5QXf0O2K+/AzGz4puzaM3LiaLTlq3bmbNoTZ0isoGSp83iB7wyMuwuwIHAvFoGZWbF9MSmLb0qt6EjT5vFv5XNbwMej4h1NYrHzApsVEszHd0khlEtzXWIxgZS1WqoiLirbLrHicKscc2aOpHmYU07lDUPa2LW1Il1isgGSp5qqOfp/gVFInt/0V79HpWZFVJnI7Z7QzWePNVQFwNPAteRJYiTgZER8S+1DMzMimn65NFODg0oT2+o4yPi0oh4PiKei4jLgGm1DszMzIoj70CCJ0tqkrSLpJPJus+amVmDyJMsPkT2Nrun0vS3qczMzBpEnt5Qj0XEtIgYHhGtETE9Ih6rtp+kqyStl/RgWdm5kjokLU/TcWXrZktql7RG0tSy8mNSWbuks/pwjWZmtpOqJgtJfyrpjs4vfUkHSzo7x7GvBo7ppvyiiJiUptvSMQ8ETgIOSvtcmqq9moD/AI4lexjwg2lbMzMbQHmqoa4AZgNbASJiBdkXe0UR8TNgY844pgE3RMSLEfEo0A4ckqb2iHgkIv5ANiaVG9fNzAZYnmTxRxFxf5eybTtxzk9JWpGqqfZJZaOBtWXbrEtlPZW/iqSZktoktW3YsGEnwjMzs67yJIunJY3nldeqnkj23EVfXAaMByalY1zQx+O8SkTMjYhSRJRaW1v767BmZka+h/JOB+YCB0jqAB4lezCv1yLiqc55SVcAt6bFDmBs2aZjUhkVys3MbIBUTBaSdgFKEfEuSX8M7BIRz/f1ZJJGRkTnXclfA509pRYC35V0ITAKmADcT/bE+ARJ+5EliZNwt10zswFXMVlExEuSPg/Mi4hePYgn6XvAkcBwSeuAc4AjJU0iq9J6DPh4Os8qSfOA1WTtIadHxPZ0nE8Bi4Am4KqIWNWbOMzMbOcporsxAss2kM4HngZupOzJ7YjI29NpwJVKpWhra6t3GGZmg4qkpRFR6m5dnjaLD6S/p5eVBbD/zgZmZmaDQ553cPsVqmZmDa7HrrOSvlo2/+6BCcfMzIqo0nMW5UN1fL3WgZiZWXHleSjPzMwaXKU2i9dJ+izZsw6d8y+LiAtrGpmZmRVGpWRxBbBnN/NmZtZgekwWEXHeQAZiZmbF5TYLMzOrysnCzMyqcrIwM7OqepUsJN1afSszMxtqentn0e1b6szMbGjrbbJYVpMozMys0HqVLCLi72sViJmZFZ
cbuM3MrConCzMzq6rSEOXXpb+fGbhwzMysiCrdWbxV0ijg7yXtI2nf8qnagSVdJWm9pAfLyvaVtFjSw+nvPqlcki6R1C5phaQpZfvMSNs/LGnGzlysmZn1TaVkcTlwB3AAsLTLlOcF11ez4zsxAM4C7oiICenYZ6XyY4EJaZoJXAZZcgHOAQ4FDgHO6UwwZmY2cHpMFhFxSUS8CbgqIvaPiP3Kpqrv346InwEbuxRPA65J89cA08vKr43MvUCLpJHAVGBxRGyMiGeBxbw6AZmZWY1VbeCOiE/04/lGRMSTaf63wIg0PxpYW7bdulTWU/mrSJopqU1S24YNG/oxZDMzq1tvqIgIIPrxeHMjohQRpdbW1v46rJmZMfDJ4qlUvUT6uz6VdwBjy7Ybk8p6KjczswE00MliIdDZo2kGcEtZ+UdSr6jDgM2pumoR8J7UG2sf4D2pzMzqZMGyDo44/072O+uHHHH+nSxY5t9vjaDSa1V3iqTvAUcCwyWtI+vVdD4wT9JpwOPA+9PmtwHHAe3A74FTASJio6QvAb9M230xIro2mtsQtmBZB3MWreGJTVsY1dLMrKkTmT7Z41nWy4JlHcyev5ItW7cD0LFpC7PnrwTw5zLEKWs6GFpKpVK0teXp3WtF1vWLCaB5WBNfO+HN/mKqkyPOv5OOTVteVT66pZl7zjqqDhFZf5K0NCJK3a3zcB9WWHMWrdkhUQBs2bqdOYvW1Ckie6KbRFGp3IYOJwsrLH8xFc+oluZeldvQ4WRhheUvpuKZNXUizcOadihrHtbErKkT6xSRDRQnCyssfzEVz/TJo/naCW9mdEszImurcBtSY6hZbyizndX5BeTeUMUyffJofwYNyMnCCs1fTGbF4GooMzOryncWVmh+KM+sGJwsrLD8tLBZcbgaygrLD+WZFYeThRWWH8ozKw4nCyssP5RnVhxOFlZYfijPrDjcwG2F5YfyzIrDycIKzQ/lmRWDq6HMzKwqJwszM6uqLslC0mOSVkpaLqktle0rabGkh9PffVK5JF0iqV3SCklT6hGzmVkjq+edxTsjYlLZK/zOAu6IiAnAHWkZ4FhgQppmApcNeKRmZg2uSA3c04Aj0/w1wE+BM1P5tZG9LPxeSS2SRkbEk3WJ0qzBebyuxlSvO4sAfixpqaSZqWxEWQL4LTAizY8G1pbtuy6V7UDSTEltkto2bNhQq7jNGlrneF0dm7YQvDJe14JlHfUOzWqsXncWb4+IDkmvAxZL+nX5yogISdGbA0bEXGAuQKlU6tW+Vlz+FVsslcbr8ucytNUlWURER/q7XtL3gUOApzqrlySNBNanzTuAsWW7j0llNsR51Nni8XhdjWvAq6Ek/bGkPTvngfcADwILgRlpsxnALWl+IfCR1CvqMGBzLdsrzl6wkvGzb2PcWT9k/OzbOHvBylqdyqrwqLPF4/G6Glc92ixGAD+X9CvgfuCHEfEj4Hzg3ZIeBt6VlgFuAx4B2oErgE/WKrCzF6zkO/f+L9sjq8XaHsF37v1fJ4w68a/Y4vF4XY1rwKuhIuIR4C3dlD8DHN1NeQCnD0BofO++tT2Wf3n6mwciBCszqqWZjm4Sg3/F1o/H62pcReo6W3eddxR5y622Zk2duEObBfhXbBF4vK7G5GRRpknqNjE0SXWIxvwr1qw4nCzKfPDQsXzn3v/tttzqw79izYrByaJMZ7vE9+5by/YImiQ+eOhYt1eYWcNTDMH6+FKpFG1tbfUOw8xsUJG0tGy8vh14iHIzM6vKycLMzKpysjAzs6qcLMzMrConCzMzq2pI9oaStAF4fCcPMxx4uh/CqTdfR/EMlWsZKtcBQ+dadvY63hARrd2tGJLJoj9IauupC9lg4usonqFyLUPlOmDoXEstr8PVUGZmVpWThZmZVeVk0bO59Q6gn/g6imeoXMtQuQ4YOtdSs+twm4WZmVXlOwszM6vKycLMzKpq6GQh6SpJ6yU92MN6SbpEUrukFZKmDHSMee
S4jiMlbZa0PE3/MtAx5iFprKQlklZLWiXpM91sM1g+kzzXUvjPRdLuku6X9Kt0Hed1s81rJN2YPpP7JI0b+Egry3kdp0jaUPZ5fLQeseYlqUnSMkm3drOu/z+TiGjYCfhLYArwYA/rjwNuBwQcBtxX75j7eB1HArfWO84c1zESmJLm9wR+Axw4SD+TPNdS+M8l/XfeI80PA+4DDuuyzSeBy9P8ScCN9Y67j9dxCvDNesfai2v6LPDd7v4fqsVn0tB3FhHxM2BjhU2mAddG5l6gRdLIgYkuvxzXMShExJMR8UCafx54COj6mrzB8pnkuZbCS/+dX0iLw9LUtVfMNOCaNH8TcLRUrHcR57yOQUPSGOC9wLd72KTfP5OGThY5jAbWli2vYxD+g08OT7fgt0s6qN7BVJNumyeT/QIsN+g+kwrXAoPgc0nVHcuB9cDiiOjxM4mIbcBm4LUDG2V1Oa4D4G9S9eZNkor8PuWLgc8DL/Wwvt8/EyeLxvAA2ZgvbwH+HVhQ53gqkrQHcDNwRkQ8V+94dkaVaxkUn0tEbI+IScAY4BBJf1bvmPoix3X8ABgXEQcDi3nll3mhSHofsD4ilg7keZ0sKusAyn9djEllg0pEPNd5Cx4RtwHDJA2vc1jdkjSM7Mv1+oiY380mg+YzqXYtg+lzAYiITcAS4Jguq17+TCTtCuwNPDOw0eXX03VExDMR8WJa/Dbw1oGOLacjgOMlPQbcABwl6Ttdtun3z8TJorKFwEdSD5zDgM0R8WS9g+otSX/SWV8p6RCyz71w/5hTjFcCD0XEhT1sNig+kzzXMhg+F0mtklrSfDPwbuDXXTZbCMxI8ycCd0ZqWS2KPNfRpe3reLJ2psKJiNkRMSYixpE1Xt8ZEX/XZbN+/0x23ZmdBztJ3yPrkTJc0jrgHLKGLyLicuA2st437cDvgVPrE2llOa7jROATkrYBW4CTivaPOTkC+DCwMtUtA3wBeD0Mrs+EfNcyGD6XkcA1kprIktm8iLhV0heBtohYSJYUr5PUTtbR4qT6hdujPNfxaUnHA9vIruOUukXbB7X+TDzch5mZVeVqKDMzq8rJwszMqnKyMDOzqpwszMysKicLMzOrysnChgxJ50r6vaTXlfMixYQAAAQbSURBVJW9UGmfXhx7nHoY1bc/pdFCf5JGPf1Arc9XJZbjJZ1VzxisOBr6OQsbkp4GPgecWe9AyknaNY3RU81kgDQsRV2l/voL6x2HFYPvLKxu0q/1X0u6WtJvJF0v6V2S7pH0cHqqubeuAj4gad9uzvVg2fI/STo3zf9U0kWS2iQ9JOltkuanGL5cdphdU4wPpYHm/ijt/1ZJd0laKmlR55PA6bgXS2oDdnifhaR9JS1Ig9bdK+ngdEf0HeBt6c5ifJd9csUp6e+UvbthuaRvpQfRkHRZ2neH9zlIekzSeZIekLRS0gGp/BRJ30zzVyt7j8h/S3pE0ompfBdJl6bPcbGk28rWna/sfR4rJP1bHz5LKxAnC6u3NwIXAAek6UPA24F/InviubdeIEsYr3rZUBV/iIgScDlwC3A68GfAKZI6R+ucCFwaEW8CngM+qWz8p38HToyIt6Zzf6XsuLtFRCkiLuhyvvOAZWnQui+QDbu+HvgocHdETIqI/+ltnJLeBHwAOCLdnWwHTk77/nPa92DgHZIOLjvu0xExBbiM7L99d0aSfTbvA85PZScA44ADyZ5YPxwg/Tf7a+CgdI1f7nowG1xcDWX19mhErASQtAq4IyJC0kqyL6G+uARY3stfs53VLSuBVZ3jTUl6hGxAtk3A2oi4J233HeDTwI/IvqwXKxvmqQkoH6vqxh7O93bgbwAi4s70Rb9XP8T5drIB8H6Z4mkmG5Ib4P2SZpL9ux9J9gW/Iq3rHOhwKVkC6M6CiHgJWC1pRNl1/Fcq/62kJal8M/D/gSuVvcntVW9zs8HFycLq7cWy+ZfKll+im/8/Jf0nWb3+ExFxXHcHjIhNkr5L9q
u70zZ2vJPevYc4ymPoGkfXsXGC7A1sqyLi8O5iAX7XQ3lfVYtTwDURMbt8J0n7kd0xvC0inpV0NTv+N+g81nZ6/l4oP1/FF+lExLZUjXg02RhYnwKOqrSPFZuroWxQiYhTUxVNt4mizIXAx3nli+8p4HXpF/xryKpSeuv1kjqTwoeAnwNrgNbOcknDlO8lRneTqockHUlWDdQf7+64Azixs0dYaht5A7AXWeLanO4Kju2HcwHcQ/bCoF3ScY9M590D2DsNvf6PwFv66XxWJ76zsCEpIp6W9H2yLyoiYquyUTnvJxvrv+sw23msAU6XdBWwGrgsIv6QGnQvkbQ32b+pi4FVVY51LnCVpBVko+fOqLx5PhGxWtLZwI8l7QJsBU6PiHslLSO77rVkX/L94Wayu4fV6bgPkFVB7QncIml3sruQz/bT+axOPOqsme0USXtExAupUft+ssb139Y7LutfvrMws511q7IXC+0GfMmJYmjynYWZmVXlBm4zM6vKycLMzKpysjAzs6qcLMzMrConCzMzq+r/AMj9uXzHC0BgAAAAAElFTkSuQmCC\n",
"text/plain": [
"<Figure size 432x288 with 1 Axes>"
]
},
"metadata": {
"needs_background": "light"
},
"output_type": "display_data"
}
],
"source": [
"meanings = {'to':4,'the':4 ,'raha':2, 'diya':3, 'my':2, 'for':3, 'com':3, 'do':2, 'india':1,'time':1}\n",
"m = [meanings[lst[i][0]] for i in range(10)]\n",
"f = [lst[i][1] for i in range(10)] \n",
"plt.scatter(m, f)\n",
"plt.xlabel('m - Number of meanings')\n",
"plt.ylabel('f - Frequency of the token')\n",
"plt.title('Zipf\\'s Law')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# 3. Heaps' Law"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [],
"source": [
"vsize = 0\n",
"num_tokens = 0\n",
"unique_tokens = []\n",
"V = []\n",
"N = []\n",
"\n",
"for i in range(len(tokens)):\n",
" s = tokens[i]\n",
" if s not in unique_tokens:\n",
" unique_tokens.append(s)\n",
" vsize += 1\n",
" V.append(vsize)\n",
" N.append(i+1)"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [],
"source": [
"N = np.array(N)\n",
"V = np.array(V)"
]
},
{
"cell_type": "code",
"execution_count": 29,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Text(0.5, 1.0, \"Heaps' Law\")"
]
},
"execution_count": 29,
"metadata": {},
"output_type": "execute_result"
},
{
"data": {
"image/png": "\n",
"text/plain": [
"<Figure size 432x288 with 1 Axes>"
]
},
"metadata": {
"needs_background": "light"
},
"output_type": "display_data"
}
],
"source": [
"plt.plot(N, V)\n",
"plt.xlabel('N - Number of tokens')\n",
"plt.ylabel('|V| - Size of vocabulary')\n",
"plt.title('Heaps\\' Law')"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"1000 610\n",
"10000 3597\n",
"20000 5920\n",
"30000 7753\n",
"50000 10806\n",
"100000 16675\n"
]
}
],
"source": [
"for i in [1000, 10000, 20000, 30000, 50000, 100000]:\n",
" print(i, V[i-1])"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {},
"outputs": [],
"source": [
"from math import log"
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"1.3188109468386284\n",
"0.6824809994294719\n",
"1.3976147883124628\n",
"0.655343979207257\n",
"1.5589214096765935\n",
"0.6405478197637083\n"
]
}
],
"source": [
"print(8385/6358)\n",
"print(log(1.3188, 3/2))\n",
"print(11719/8385)\n",
"print(log(1.397614, 5/3))\n",
"print(18269/11719)\n",
"print(log(1.558921, 10/5))"
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"10.340940789558791"
]
},
"execution_count": 23,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"11719 / 50000 ** 0.65"
]
},
{
"cell_type": "code",
"execution_count": 28,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Text(0.5, 1.0, \"Heaps' Law\")"
]
},
"execution_count": 28,
"metadata": {},
"output_type": "execute_result"
},
{
"data": {
"image/png": "\n",
"text/plain": [
"<Figure size 432x288 with 1 Axes>"
]
},
"metadata": {
"needs_background": "light"
},
"output_type": "display_data"
}
],
"source": [
"plt.plot(N, V)\n",
"\n",
"k = 10.34\n",
"beta = 0.64\n",
"plt.plot(N, k * (N**beta))\n",
"plt.xlabel('N - Number of tokens')\n",
"plt.ylabel('|V| - Size of vocabulary')\n",
"plt.title('Heaps\\' Law')"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.8"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
h>0</th>\n",
" <td>@BubblyDentist @MeetUunngLee nahi nahi, mere s...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>going to the grammys first entertainment law b...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>bohot hi badiya ji aap sunao?</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>Parvez Musharraf is Digvijay Singh of Pakistan...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>guddu ko bass john cena k sticker ki padii hai...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>Aman ki maa ki... Asha https://twitter.com/ash...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>pakistan can wait more more and more . . . ...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7</th>\n",
" <td>@sagarcasm Jai Mahesh !!</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8</th>\n",
" <td>RT AAP_Najafgarh \" RT AAPInNews : When DDCA lo...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9</th>\n",
" <td>\"Kaam ho jayega, thoda kharcha paani lagega\" \\...</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Sentence\n",
"0 @BubblyDentist @MeetUunngLee nahi nahi, mere s...\n",
"1 going to the grammys first entertainment law b...\n",
"2 bohot hi badiya ji aap sunao?\n",
"3 Parvez Musharraf is Digvijay Singh of Pakistan...\n",
"4 guddu ko bass john cena k sticker ki padii hai...\n",
"5 Aman ki maa ki... Asha https://twitter.com/ash...\n",
"6 pakistan can wait more more and more . . . ...\n",
"7 @sagarcasm Jai Mahesh !!\n",
"8 RT AAP_Najafgarh \" RT AAPInNews : When DDCA lo...\n",
"9 \"Kaam ho jayega, thoda kharcha paani lagega\" \\..."
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"data[:10]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# s = re.sub('[^0-9a-zA-Z]+', '*', s)\n",
"# https://stackoverflow.com/questions/12985456/replace-all-non-alphanumeric-characters-in-a-string"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [],
"source": [
"def cleanText(raw_text):\n",
" '''\n",
" Convert a raw review to a cleaned review\n",
" '''\n",
" text = BeautifulSoup(raw_text, 'html').get_text() #remove html\n",
" words = text.split()\n",
" words = [w for w in words if '@' not in w and '#' not in w] # remove the @-words and #-words\n",
" text = ' '.join(words)\n",
" letters_only = re.sub('[^a-zA-Z]+', ' ', text) # remove non-character\n",
" \n",
" return( letters_only.lower())\n",
"\n",
"vclean = np.vectorize(cleanText)"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Sentence</th>\n",
" <th>Cleaned sentence</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>@BubblyDentist @MeetUunngLee nahi nahi, mere s...</td>\n",
" <td>nahi nahi mere saath jaakar pachtaogi ye uunng...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>going to the grammys first entertainment law b...</td>\n",
" <td>going to the grammys first entertainment law b...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>bohot hi badiya ji aap sunao?</td>\n",
" <td>bohot hi badiya ji aap sunao</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>Parvez Musharraf is Digvijay Singh of Pakistan...</td>\n",
" <td>parvez musharraf is digvijay singh of pakistan...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>guddu ko bass john cena k sticker ki padii hai...</td>\n",
" <td>guddu ko bass john cena k sticker ki padii hai...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>Aman ki maa ki... Asha https://twitter.com/ash...</td>\n",
" <td>aman ki maa ki asha https twitter com ashabhos...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>pakistan can wait more more and more . . . ...</td>\n",
" <td>pakistan can wait more more and more aakhir pa...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7</th>\n",
" <td>@sagarcasm Jai Mahesh !!</td>\n",
" <td>jai mahesh</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8</th>\n",
" <td>RT AAP_Najafgarh \" RT AAPInNews : When DDCA lo...</td>\n",
" <td>rt aap najafgarh rt aapinnews when ddca lowere...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9</th>\n",
" <td>\"Kaam ho jayega, thoda kharcha paani lagega\" \\...</td>\n",
" <td>kaam ho jayega thoda kharcha paani lagega sir...</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Sentence \\\n",
"0 @BubblyDentist @MeetUunngLee nahi nahi, mere s... \n",
"1 going to the grammys first entertainment law b... \n",
"2 bohot hi badiya ji aap sunao? \n",
"3 Parvez Musharraf is Digvijay Singh of Pakistan... \n",
"4 guddu ko bass john cena k sticker ki padii hai... \n",
"5 Aman ki maa ki... Asha https://twitter.com/ash... \n",
"6 pakistan can wait more more and more . . . ... \n",
"7 @sagarcasm Jai Mahesh !! \n",
"8 RT AAP_Najafgarh \" RT AAPInNews : When DDCA lo... \n",
"9 \"Kaam ho jayega, thoda kharcha paani lagega\" \\... \n",
"\n",
" Cleaned sentence \n",
"0 nahi nahi mere saath jaakar pachtaogi ye uunng... \n",
"1 going to the grammys first entertainment law b... \n",
"2 bohot hi badiya ji aap sunao \n",
"3 parvez musharraf is digvijay singh of pakistan... \n",
"4 guddu ko bass john cena k sticker ki padii hai... \n",
"5 aman ki maa ki asha https twitter com ashabhos... \n",
"6 pakistan can wait more more and more aakhir pa... \n",
"7 jai mahesh \n",
"8 rt aap najafgarh rt aapinnews when ddca lowere... \n",
"9 kaam ho jayega thoda kharcha paani lagega sir... "
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Store the cleaned text next to the raw text, then preview the first ten rows.\n",
"data['Cleaned sentence'] = vclean(data['Sentence'])\n",
"data.head(10)"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0 nahi nahi mere saath jaakar pachtaogi ye uunng...\n",
"1 going to the grammys first entertainment law b...\n",
"2 bohot hi badiya ji aap sunao \n",
"3 parvez musharraf is digvijay singh of pakistan...\n",
"4 guddu ko bass john cena k sticker ki padii hai...\n",
"5 aman ki maa ki asha https twitter com ashabhos...\n",
"6 pakistan can wait more more and more aakhir pa...\n",
"7 jai mahesh \n",
"8 rt aap najafgarh rt aapinnews when ddca lowere...\n",
"9 kaam ho jayega thoda kharcha paani lagega sir...\n",
"Name: Cleaned sentence, dtype: object"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# First ten entries of the cleaned column only.\n",
"data['Cleaned sentence'].head(10)"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [],
"source": [
"def tokenize(s):\n",
"    \"\"\"Split a sentence on whitespace and return its words as a tuple.\"\"\"\n",
"    words = s.split()\n",
"    return tuple(words)"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Number of tokens: 303161\n",
"Number of word types: 32707\n"
]
}
],
"source": [
"# Flatten every cleaned sentence into one corpus-wide token list.\n",
"# Iterating the column directly avoids materialising a row Series per\n",
"# sentence, which data.iloc[i]['Cleaned sentence'] does on every step.\n",
"tokens = []\n",
"for sentence in data['Cleaned sentence']:\n",
"    tokens.extend(tokenize(sentence))\n",
"sorted_tokens = sorted(tokens)\n",
"word_types = list(set(tokens))  # distinct tokens; order is arbitrary\n",
"print('Number of tokens:', len(tokens))\n",
"print('Number of word types:', len(word_types))"
]
},
{
"cell_type": "code",
"execution_count": 26,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"TTR: 0.10788656852299604\n"
]
}
],
"source": [
"# Type-token ratio: number of distinct word types over total token count.\n",
"print(\n",
"    'TTR:',\n",
"    len(word_types) / len(tokens),\n",
")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# 2. Zipf's Law"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [],
"source": [
"# Tally occurrences per word type. sorted_tokens is walked in sorted order,\n",
"# so keys are inserted into the dict in that same order.\n",
"token_count = {}\n",
"for word in sorted_tokens:\n",
"    token_count[word] = token_count.get(word, 0) + 1"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Top 50 Word types in decreasing order of number of occurences:\n",
"\n"
]
},
{
"data": {
"text/plain": [
"[('hai', 10030),\n",
" ('to', 4154),\n",
" ('ki', 3224),\n",
" ('ke', 3170),\n",
" ('nahi', 3169),\n",
" ('bhi', 2929),\n",
" ('the', 2866),\n",
" ('se', 2601),\n",
" ('ho', 2365),\n",
" ('ka', 2310),\n",
" ('bhai', 2266),\n",
" ('ko', 2208),\n",
" ('me', 1955),\n",
" ('ye', 1869),\n",
" ('kya', 1815),\n",
" ('hi', 1801),\n",
" ('aur', 1797),\n",
" ('twitter', 1760),\n",
" ('com', 1724),\n",
" ('kar', 1681),\n",
" ('i', 1509),\n",
" ('in', 1387),\n",
" ('t', 1319),\n",
" ('https', 1310),\n",
" ('is', 1296),\n",
" ('mein', 1276),\n",
" ('a', 1202),\n",
" ('ek', 1165),\n",
" ('and', 1126),\n",
" ('status', 1108),\n",
" ('of', 1074),\n",
" ('on', 1071),\n",
" ('na', 1026),\n",
" ('s', 1009),\n",
" ('ab', 969),\n",
" ('toh', 963),\n",
" ('rt', 944),\n",
" ('tha', 937),\n",
" ('http', 905),\n",
" ('for', 885),\n",
" ('you', 885),\n",
" ('aaj', 873),\n",
" ('co', 872),\n",
" ('raha', 868),\n",
" ('par', 826),\n",
" ('ne', 824),\n",
" ('aap', 820),\n",
" ('hain', 816),\n",
" ('koi', 802),\n",
" ('kuch', 801)]"
]
},
"execution_count": 15,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Rank word types by count, highest first. The sort is stable, so word types\n",
"# with equal counts keep the order in which token_count yields them.\n",
"sorted_token_count = sorted(token_count.items(), key=lambda pair: pair[1], reverse=True)\n",
"print('Top 50 Word types in decreasing order of number of occurrences:\\n')\n",
"sorted_token_count[:50]"
]
},
{
"cell_type": "code",
"execution_count": 30,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Text(0.5, 1.0, \"Zipf's Law\")"
]
},
"execution_count": 30,
"metadata": {},
"output_type": "execute_result"
},
{
"data": {
"image/png": "\n",
"text/plain": [
"<Figure size 432x288 with 1 Axes>"
]
},
"metadata": {
"needs_background": "light"
},
"output_type": "display_data"
}
],
"source": [
"# Word length against frequency, one point per word type.\n",
"# The scatter puts length on the x-axis and frequency on the y-axis, so the\n",
"# axis labels state exactly that; labelling them 'frequency' and 'rank' would\n",
"# describe neither axis.\n",
"length = [len(item[0]) for item in sorted_token_count]\n",
"frequency = [item[1] for item in sorted_token_count]\n",
"plt.scatter(length, frequency)\n",
"\n",
"plt.xlabel('Length of token (characters)')\n",
"plt.ylabel('f - Frequency of token')\n",
"plt.title('Zipf\\'s Law')"
]
},
{
"cell_type": "code",
"execution_count": 31,
"metadata": {},
"outputs": [],
"source": [
"from random import choice"
]
},
{
"cell_type": "code",
"execution_count": 39,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[('to', 4154),\n",
" ('bhi', 2929),\n",
" ('the', 2866),\n",
" ('https', 1310),\n",
" ('mein', 1276),\n",
" ('ek', 1165),\n",
" ('status', 1108),\n",
" ('on', 1071),\n",
" ('http', 905),\n",
" ('co', 872),\n",
" ('aap', 820),\n",
" ('log', 448)]"
]
},
"execution_count": 39,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import random\n",
"\n",
"# Draw 10 of the 100 most frequent word types without replacement.\n",
"# A fixed seed makes the draw identical on every run, so hand-written\n",
"# per-word annotations keep referring to the words actually sampled.\n",
"SAMPLE_SEED = 0\n",
"choose = sorted_token_count[:100]  # candidate pool (a copy of the slice)\n",
"lst = random.Random(SAMPLE_SEED).sample(choose, 10)\n",
"lst.sort(key=lambda x: x[1], reverse=True)\n",
"lst"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Hand-entered values per sampled word (presumably the number of meanings\n",
"# of each word - confirm against the assignment text).\n",
"# TODO: the entry for 'mein' had no value, which is a syntax error; None marks\n",
"# it as still missing until the real value is filled in.\n",
"meanings = {'to': 3, 'bhi': 2, 'the': 3, 'https': 1, 'mein': None}"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# 3. Heaps' Law"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [],
"source": [
"# Track vocabulary growth token by token for the Heaps' law curve:\n",
"# V[j] is the number of distinct word types among the first N[j] tokens.\n",
"seen = set()        # constant-time membership; scanning a list here is quadratic\n",
"unique_tokens = []  # word types in order of first appearance\n",
"V = []\n",
"N = []\n",
"\n",
"for i, s in enumerate(tokens, start=1):\n",
"    if s not in seen:\n",
"        seen.add(s)\n",
"        unique_tokens.append(s)\n",
"    # One entry per token, whether or not it introduced a new word type.\n",
"    V.append(len(unique_tokens))\n",
"    N.append(i)\n",
"\n",
"vsize = len(unique_tokens)  # final vocabulary size"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [],
"source": [
"# Convert both series to NumPy arrays so they support elementwise arithmetic.\n",
"N, V = np.array(N), np.array(V)"
]
},
{
"cell_type": "code",
"execution_count": 29,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Text(0.5, 1.0, \"Heaps' Law\")"
]
},
"execution_count": 29,
"metadata": {},
"output_type": "execute_result"
},
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAZIAAAEWCAYAAABMoxE0AAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4xLjEsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy8QZhcZAAAgAElEQVR4nO3dd3zV1f3H8deHjWxkGPZUQBliBNyoLSJqsdYqOOvCOqq2tRVbq3b82tq6994D92hFERVQVEYYyoawiUDYhE2Sz++P74leaSCXJDc3N3k/H4/7uN97vuvz5YZ88j3nfM8xd0dERKS4qiQ7ABERSW1KJCIiUiJKJCIiUiJKJCIiUiJKJCIiUiJKJCIiUiJKJCIiUiJKJCKAmS0xsx/tUfYLMxufrJhCDHt90KuwmEWSQYlERERKRIlEJE5m1sLM3jSzNWa22Myui1nXx8y+MrONZrbSzB40sxox693MrjOzRWa21sz+bWZVwrpOZjbOzDaFda+WMM5GZvbfEOeGsNwqrDvRzGbEbDvazCbHfP7czM4syfml8lEiEYlD+KX/H+BroCVwMnCDmZ0SNskDfg00AY4K66/e4zA/BdKB3sBg4NJQ/lfgI6AR0Ap4oGAHd7dihFsFeAZoC7QBtgMPhnUTgM5m1sTMqgM9gBZmVs/Maof4Pi/GOaUSq5bsAETKkXfMLDfmcw1galg+Emjq7n8JnxeZ2RPAEGCUu0+J2W+JmT0GnADcG1N+h7uvB9ab2b3AUOBJYDfRL/0W7r4CKFG7jLuvA94s+Gxm/weMCeu2hzuQ44FviRLjRuAYYCewIOwvEjfdkYh870x3b1jw4od3FG2J/nLfWPAC/gA0BzCzg0MV0ioz2wz8nejuJNbymOWlQIuw/HvAgElmNsvMLqUEzOwAM3vMzJaGWD4DGppZ1bDJOKA/UTIZB4wlSnonhM8i+0WJRCQ+y4HFsYnG3eu5+6Cw/hFgLtDZ3esTJZk9q6Vaxyy3IbojwN1XufsV7t4CuBJ42Mw6lSDW3wKHAH1DLMeH8oJ49kwk41AikRJQIhGJzyQgx8xuMrPaZlbVzA4zsyPD+nrAZmCLmXUBrirkGL8LDeGtgeuBVwHM7OcFjeHABsCB/Djjqm5mtWJe1UIs24GNZtYYuG2Pfb4kSjR9gEnuPovojqsv0d2LyH5RIhGJg7vnAacDvYDFwFqi9o0GYZMbgfOAHOAJQpLYw7vAFGA68D7wVCg/EphoZluA94Dr3X1RnKGNJEoaBa/bidplaocYJwAf7nEtW4nafma5+65Q/BWw1N2z4zyvyHdME1uJJF54sLCzu2cmOxaR0qY7EhERKRElEhERKRFVbYmISInojkREREqk0j3Z3qRJE2/Xrl2ywxARSSlTpkxZ6+5NC1tX6RJJu3btyMjISHYYIiIpxcyW7m2dqrZERKRElEhERKRElEhERKRElEhERKRElEhERKRElEhERKRElEhERKRElEhERCq4FRu28cRni/gic21Cjl/pHkgUEakMMrO3MHr2aj6YuZJvVmwC4Kr+HTmm054zQJecEomISAWxKzefUbNW8cKEpUxavB6A7i0bcNPALpzWPY02Bx6QkPMqkYiIpLDcvHy+WLiO97/5lv9+s5Jtu/Jo0/gAbj61Cz/p1YK0BrUTHoMSiYhICpr97WZenLiUj2atYu2WXdSuXpUBhzZncK8W9D+4GVWqWJnFokQiIpIisnN28OKEZbz/zbcsXLOVmtWq8ONuzTm9Rxr9D2lGrepVkxKXEomISDm2Oy+fMXOzeWXSMsZnrmV3nnNc5yac37ctZ/VuScMDaiQ7RCUSEZHyKGvjdt6asoIRk5eTtXE7zerV5NJj2nPuka3p0LRussP7ASUSEZFywt35bMFanv9yCWPnryEv3+nTvjG3ntGNk7s0o1rV8vnonxKJiEiS7czN4z9fr+ThsZksWrOV5vVrctmx7T
mvTxvaNamT7PCKpEQiIpIkC9ds4dXJy3ktYzkbt+2mW1p9/v7T7vzsiJbUrJachvPiUCIRESlDO3Pz+GRONo+MXciMrE1UrWIM6Nacc49szXGdm1K1DLvtlhYlEhGRMjBn5WaeGr+YD2euYsvOXFo0qMUtp3XlJz1b0Kx+rWSHVyJKJCIiCbJp+25GzVzFG1NWMGnJeurUqMqp3dMY1P0gju/ctNw2nu+vhCYSM6sFfAbUDOd6w91vM7P2wAjgQGAKcKG77zKzmsDzwBHAOuBcd18SjnUzcBmQB1zn7qNC+UDgPqAq8KS7/zOR1yQiUpTM7Bzu+XgBH85cRV6+0/bAAxh+aheGHNm6XDz3UdoSfUeyEzjJ3beYWXVgvJl9APwGuMfdR5jZo0QJ4pHwvsHdO5nZEOAO4Fwz6wYMAQ4FWgAfm9nB4RwPAT8GVgCTzew9d5+d4OsSEfmB3Lx8Pp6TzbNfLmbCouju49Jj2nFajxb0bNUAs9Rr+4hXQhOJuzuwJXysHl4OnAScF8qfA24nSiSDwzLAG8CDFv3rDwZGuPtOYLGZZQJ9wnaZ7r4IwMxGhG2VSESkTKzbspMRk5fzzBdLWLtlJwfVr8WNAw5maJ82HFi3ZrLDKxMJbyMxs6pE1VediO4eFgIb3T03bLICaBmWWwLLAdw918w2EVV/tQQmxBw2dp/le5T3TcBliIh8Jy/f+XjOal7PWMFn89ewKy+fYzs14YJ+h/GjruX3wcFESXgicfc8oJeZNQTeBrok+px7MrNhwDCANm3alPXpRaSCWLlpO29NzeKlCUv5dtMOmtaryYVHtWXIka3p3LxessNLmjLrteXuG81sDHAU0NDMqoW7klZAVtgsC2gNrDCzakADokb3gvICsfvsrTz23I8DjwOkp6d7qV2UiFR423flMW7+Gt6cuoJP5qwm3/l+2JKuzaleye4+CpPoXltNgd0hidQmahS/AxgDnE3Uc+ti4N2wy3vh81dh/afu7mb2HvCymd1N1NjeGZgEGNA59ALLImqQL2h7EREptuXrt/Hk54t4c2oWW3bmcmCdGlxxfAfOTS9/gyYmW6LvSNKA50I7SRXgNXf/r5nNBkaY2d+AacBTYfungBdCY/p6osSAu88ys9eIGtFzgWtClRlmdi0wiqj779PuPivB1yQiFZS780XmOp4cv4ix89ZgBmf0aME56a3p26Gx7j72wqKOVZVHenq6Z2RkJDsMESlHNm3bzetTlvPypGUsWrOVA+vU4Px+bTknvRWtGiVmnvNUY2ZT3D29sHV6sl1EKq3l67fxxOeLeHtqFjk7c+ndpiH/OrsHg3u1SKlBE5NNiUREKpUdu/P47zcreWPKciYsWk/1qsag7mlcfmwHurdqkOzwUpISiYhUCvNW5fDKpGW8Oz2LDdt207pxbW4ccDA/7d2Klg1rJzu8lKZEIiIV1sZtu3hrahZvTl3BrG83U72qMeDQgzi/TxuO6nhghR62pCwpkYhIhZKzYzdj563hlUnLmLBoHfkO3dLqc+vp3Rjcq0WlGbakLCmRiEiFMH91Dg9+msnIGSvJzXdaNqzN1f07MfCwgzispdo+EkmJRERSVn4Y82rE5OWMmZdNnRrVuKBfWwYedhBHtmuckrMNpiIlEhFJObvz8nn/m5U8OCaTzOwtNK1XkyuP78jlx7WniaquypwSiYikjGnLNvD2tCzenpZFzo5cDm5el/uHHs6gww6qdCPulidKJCJS7n2zYiP/HjWPzxespWa1Kgw49CAG92zBSV2aUUXVV0mnRCIi5dbcVZt5eMxC3vv6WxoeUJ0bBxzML45pT92a+tVVnujbEJFyZ9Li9Tzx+SJGz15NrepVuKp/R67u35F6taonOzQphBKJiJQL+fnOmHnZPDpuIZOXbKDhAdW59sROXHZsexrVqZHs8GQflEhEJKl27M7jnWlZvDRxGTOyNpHWoBa3nt6NoX3aULuGBk5MBUokIpIUm7bv5sUJS3
nuyyVk5+ykfZM6/OvsHvykZwtqVVcCSSVKJCJSpjZs3cULE5byzBeL2bBtN8d1bsLd5/TimE4a+ypVKZGISJlYvn4b94yez3+/WcmuvHyO7dSEmwZ20dDtFYASiYgk1LxVOTw2biHvfv0t1asa5xzZigv7teOQg+olOzQpJUokIpIQo2ev5qExmUxfvpE6NapyYb+2XH5ce01dWwEpkYhIqZq8ZD33fbyA8Zlr6dC0Dn8Y1IWf9W6l4dsrMCUSESmx/HxnfOZaHvtsIV9krqNJ3ZrcfGoXLjmmPTWqaQysii6uRGJmvwJedPcNCY5HRFKIuzNyxioeGpPJ7JWbaVK3Jn8Y1IXz+7aljoYxqTTi/VOhOTDZzF4zs4EWRx89M2ttZmPMbLaZzTKz60P57WaWZWbTw2tQzD43m1mmmc0zs1NiygeGskwzGx5T3t7MJobyV81Mj7+KlIHcvHxGzVrF6Q+M55qXp5Kzczf/PrsHXww/kWHHd1QSqWTM3ePbMEoeA4BLgHTgNeApd1+4l+3TgDR3n2pm9YApwJnAOcAWd79zj+27Aa8AfYAWwMfAwWH1fODHwApgMjDU3Web2WvAW+4+wsweBb5290f2dR3p6emekZER1zWLyA+5O6Nnr+av789m+frttGl8AL86qRNn9W6lSaQqODOb4u7pha2L+88Gd3czWwWsAnKBRsAbZjba3X9fyPYrgZVhOcfM5gAt93GKwcAId98JLDazTKKkApDp7ovCxYwABofjnQScF7Z5Drgd2GciEZH9l5cfJZCnv1jMpMXr6dCkDg+d15sfd2uuNhCJu43keuAiYC3wJPA7d99tZlWABcD/JJI99m8HHA5MBI4BrjWzi4AM4Leh7aUlMCFmtxV8n3iW71HeFzgQ2OjuuYVsv+f5hwHDANq0aVP0BYsIADtz83hzShbPfrmY+au30Lx+TW4/oxtD+rTRMCbynXjvSBoBZ7n70thCd883s9P3taOZ1QXeBG5w981m9gjwV8DD+13Apfsd+X5w98eBxyGq2krkuUQqgm27cnl18nIeHruQNTk76ZpWn7vP6ckZPVtQXTMRyh6KTCRmVhUY4u63F7be3efsY9/qREnkJXd/K2y/Omb9E8B/w8csoHXM7q1CGXspXwc0NLNq4a4kdnsRKYZtu3J55oslPDpuITk7cunbvjF3n9OTYzs10ThYsldFJhJ3zws9ptq4+7J4Dxwa558C5rj73THlaaH9BOCnwMyw/B7wspndTdTY3hmYBBjQ2czaEyWKIcB5oc1mDHA2MAK4GHg33vhE5HuL1mzhsXGLeH/GSrbszOVHXZtz5QkdSG/bSAlEirQ/VVuzzGwSsLWg0N1/so99jgEuBGaY2fRQ9gdgqJn1IqraWgJcGY41K/TCmk3UmH+Nu+cBmNm1wCigKvC0u88Kx7sJGGFmfwOmESUuEYnT4rVbuf+TBbw9LYsa1arwk54tGHJka9LbNU52aJJC4ur+a2YnFFbu7uNKPaIEU/dfEcjauJ2Hx2QyYvJyqlc1LjqqHZcf155m9WolOzQpp0rc/TcVE4aI/K/szTt4ZNxCXpq4jPx8Z8iRrbn+5M40q68EIsUXb/fffsADQFegBlEV01Z3r5/A2ESklCxeu5UXvlrKaxnL2bYrl58e3opf/7izRuKVUhFvG8mDRI3crxM91X4R3z91LiLlVGb2Fh4em8k707IwMwZ1T+P6kzvRqZnmApHSsz9PtmeaWdXQAP6MmU0Dbk5caCJSXNk5O3hq/GKe/HwxBlxyTHuuPKGD2kAkIeJNJNvCgIjTzexfREOf6KkkkXJm3qocnh6/mHemZ7E7L58zerbgT6d3o4nmApEEijeRXEjULnIt8GuiBwR/lqigRCR+7s64+Wt4/LNFfLlwHbWqV+HMXi254vgOdGpWN9nhSSUQb6+tgqFRtgN/Tlw4IhKv/Hznk7nZPDI2k6nLNpLWoBY3DjiY8/u2pVEdzaggZWeficTMZh
A9OFgod+9R6hGJyD7l5zsfzlrFPaPnsyB7C60a1eYvgw9laJ82GgdLkqKoO5J9DsgoImUnNy+fd6Z/y5OfL2Luqhw6NKnDXT/vyeBeLaimBCJJtM9EsudovyKSHJ/OXc0/P5jL/NVb6HJQPf51dg/OOrylEoiUC/E+kJjD91VcNYDq6IFEkYT7bP4a7v9kARlLN9C+SR3uG9KLn/RsoYEUpVyJt7H9u6eXwqi+g4F+iQpKpLLLWLKeez6ezxeZ62jRoBa3nt6NC/q11WyEUi7F/UBiAY9GeXzHzG4Dhpd+SCKVU8F86E9+vphJS9ZzYJ0a3HJaVy48qi01q2k2Qim/4q3aOivmYxWiYVJ2JCQikUpoxopN/OW/s5i8ZANpDWrxx0FdOb9fGw6osd9/64mUuXh/Ss+IWc4lmkdkcKlHI1LJLF23lQc+zeTNqSs4sE4N/nrmYQw9srUa0SWlxNtGckmiAxGpTL5evpGHx2by0ezVVDXj8mPbc+2JnWlwQPVkhyay3+Kt2uoA3EfUwO7AV8Cv3X1RAmMTqXCyc3bwz5FzeWtaFg0PqM7V/TtyQb+2pDWonezQRIot3qqtl4GHiOZYh2hI+VeAvokISqSi2b4rj6fGL+KRsQvZnef88oSOXH1iR+rX0h2IpL54E8kB7v5CzOcXzex3iQhIpCLZuG0Xz3yxhJcmLmXtll2c1KUZfxjUVYMpSoVS1FhbjcPiB2Y2HBhBVLV1LjAywbGJpKzsnB28MnE5T41fxOYduZzUpRnDju9Avw4HJjs0kVJX1B3JFKLEUfAY7ZUx6xxNbCXyA1t25vLsF4t5ZOxCtu7K4+QuzfjNgIM5tEWDZIcmkjBFjbXVviQHN7PWwPNAc6LE87i73xfudF4F2hF1JT7H3TeEp+bvAwYB24BfuPvUcKyLgVvCof/m7s+F8iOAZ4HaRHdJ14eHJkXKTG5ePi9NXMa9H89nw7bdnNSlGTef2oXOzTWlrVR8cT/tZGaHAd2A7+bqdPfni9gtF/itu081s3rAFDMbDfwC+MTd/xmqzIYDNwGnAp3Dqy/wCNA3JJ7biB6E9HCc99x9Q9jmCmAiUSIZCHwQ73WJlNS0ZRv407szmZm1mX4dGjP81K70at0w2WGJlJl4u//eBvQnSiQjiX7hjye629grd19JNC0v7p5jZnOAlkQPM/YPmz0HjCVKJIOB58MdxQQza2hmaWHb0e6+PsQzGhhoZmOB+u4+IZQ/D5yJEomUgayN2/m/92czcsYqmtStyQNDD+f0HmkaUFEqnXjvSM4GegLT3P0SM2sOvLg/JzKzdsDhRHcOzUOSAVhFVPUFUZJZHrPbilC2r/IVhZTvee5hwDCANm3a7E/YIv8jNy+ft6Zm8fcP5rBjdx7XntiJX/bvSN2aGs5EKqd4f/K3u3u+meWaWX0gm2je9riYWV3gTeAGd98c+xebu7uZJbRNw90fBx4HSE9PV/uJFNvSdVu5+qWpzPp2M4e3aci/z+5Bp2ZqB5HKLd5EkmFmDYEniHpybSF6ur1IZladKIm85O5vheLVZpbm7itD1VV2KM/ihwmqVSjL4vuqsILysaG8VSHbi5Sq3Lx8nv1yCf8aNY/a1aty35BenNGjBVWqqBpLJN6xtq4Oi4+a2YdE7RLfFLVf6IX1FDDH3e+OWfUecDHwz/D+bkz5tWY2gqixfVNINqOAv5tZo7DdAOBmd19vZpvNrB9RldlFwAPxXJNIvCYsWsef/zObOSs3c1znJvzjrO60anRAssMSKTfibWz/KfCpu29y9yWhEfxMd3+niF2PAS4EZpjZ9FD2B6IE8pqZXQYsBc4J60YSdf3NJOr+ewlASBh/BSaH7f5S0PAOXM333X8/QA3tUkqWrtvKPaPn8870b0lrUEuN6SJ7YfE8cmFm09291x5l09z98IRFliDp6emekZGR7DCkHNu2K5d7P17AM18sxjAuO64915/cmVrVNbmUVF5mNsXd0wtbF28bSW
GTI6iLilQ4Y+Zm8+f/zGLJum2cfUQrbhxwCAc1qFX0jiKV2P40tt9NNAIwwDVEje4iFcLqzTu4Z/R8RkxeTqtGtXn58r4c3alJssMSSQnxJpJfAX8iGtYEYDRRMhFJaXn5zhOfL+Ke0fPJzXeGHd+B351yCNU1Q6FI3OLttbUVGB6GOXF335LYsEQS76uF6/jHB3P4ZsUmftS1GX86vRttD6yT7LBEUk68vba6Ew2H0jh8Xgtc7O4zExibSEKs27KTez6ez4sTltGyYW3u/HlPfta7pXpjiRRTvFVbjwG/cfcxAGbWn+hJ8aMTFJdIqdu2K5cnPlvMU+MXkbMzl0uOacfvT+lC7RrqjSVSEvEmkjoFSQTA3ceameoAJCW4O58vWMst78xk2fpt/Khrc24aeIiGeBcpJfEmkkVm9iegYLrdC4BFiQlJpPTMW5XD396fzecL1tKm8QG8fEVfju6o3lgipSneRHIp8GegYKysz0KZSLmUnbODf304jzenrqB+reoMP7ULlxzTjprVVI0lUtriTSTt3f26hEYiUgrcnXemZ3Hru7PYuTufy49tz1X9O9G4To1khyZSYcWbSO4ys4OAN4BX1VtLyqNVm3bwuze+5vMFa+ndpiH/OrsnnZrVTXZYIhVevM+RnBgSyTnAY2FOklfd/W8JjU4kDrvz8nnhq6Xc+/F8dubmc8tpXbnkmPZU1RDvImUi7vGy3H0VcL+ZjQF+D9wKKJFIUs3+djM3vv41s1dG86X//afd6dBUdyEiZSneBxK7AucCPwPWEQ2V8tsExiWyTzk7dnPnqHm8NHEZ9WtX59ELejPwsLRkhyVSKcV7R/I0MAI4xd2/TWA8IkV6e9oK/u/9uazdspOhfVrzu1O6qDFdJInibSM5KtGBiBQlO2cHv3/jG8bOW0PP1g158uJ0erVumOywRCo9zSki5V5uXj4vTFjKHR/OJd/hD4O6cNmxHdSYLlJOKJFIufbZ/DX8feQc5q7K4bjOTbjtjEPVpVeknNnnpAtm9kJ4v75swhGJ7Nidx63vzuSipyexbVce9w3pxfOX9lESESmHirojOcLMWgCXmtnzwA/qEtx9fcIik0pr/IK13PruTBat3cpFR7XlD4O6ar50kXKsqETyKPAJ0IFoat3YROKhXKRU5OU7j45byJ0fzaN1owN47tI+nHBw02SHJSJF2GfVlrvf7+5dgafdvYO7t495FZlEzOxpM8s2s5kxZbebWZaZTQ+vQTHrbjazTDObZ2anxJQPDGWZZjY8pry9mU0M5a+amfqApqiJi9ZxxgPj+feoeQzo1pxRNxyvJCKSIuKamNrdrzKznmZ2bXj1iPP4zwIDCym/x917hddIADPrBgwBDg37PGxmVc2sKvAQcCrQDRgatgW4IxyrE7ABuCzOuKSc2J2XzwOfLGDoExPYsG0XD553OI9ecIQmmxJJIXElEjO7DngJaBZeL5nZr4raz90/A+JtRxkMjHD3ne6+GMgE+oRXprsvcvddRA9GDrZoXtSTiAaSBHgOODPOc0k5kJmdw08e/IK7Rs/n1MPS+PCG4zm9RwtNeSuSYuLt/ns50NfdtwKY2R3AV8ADxTzvtWZ2EZAB/NbdNwAtgQkx26wIZQDL9yjvCxwIbHT33EK2/wEzGwYMA2jTpk0xQ5bSkpuXz5PjF3P3R/M5oGZVHjm/N6d21/AmIqkqrjsSokb2vJjPeezRg2s/PAJ0BHoBK4G7inmcuLn74+6e7u7pTZuq3j2ZZqzYxJkPf8E/P5jL8Qc35aMbjlcSEUlx8d6RPANMNLO3w+czgaeKc0J3X12wbGZPAP8NH7OA1jGbtgpl7KV8HdDQzKqFu5LY7aWccXdenrSMW9+dReM6NXhg6OGc3iNN1VgiFUC8Y23dbWZjgWND0SXuPq04JzSzNHdfGT7+FCjo0fUe8LKZ3Q20ADoDk4jufDqbWXuiRDEEOM/dPQxpfzZRu8nFwLvFiUkSa9GaLdz67i
zGZ67luM5NePC83jSoXT3ZYYlIKdmf+UimAlP35+Bm9grQH2hiZiuA24D+ZtaL6DmUJcCV4fizzOw1YDaQC1zj7nnhONcCo4CqRF2RZ4VT3ASMMLO/AdMo5l2SJEZuXj73f5rJo+MWUrNaFf78k0O5sF9bqmiMLJEKxdw92TGUqfT0dM/IyEh2GBXe6s07+O1rXzM+cy0DujXnr2ceRvP6tZIdlogUk5lNcff0wtZp0EYpdV8uXMt1r0wjZ0cud/ysO+ceqZ5yIhVZ3InEzNoCnd39YzOrDVRz95zEhSapxt157LNF3DlqHm0aH8BLl/fjkIPqJTssEUmweKfavYLoOYzGRF13WxGNw3Vy4kKTVBI76dQphzbnX2f3VIO6SCUR7x3JNURPmE8EcPcFZtYsYVFJShk7L5sbX/+azTty+eOgrlx2bHs1qItUIvEmkp3uvqugz7+ZVSPqdSWV2KZtu/nr+7N5Y8oKuhxUjxcu60vXtPrJDktEyli8iWScmf0BqG1mPwauBv6TuLCkvBszN5ub3vyGdVt38csTOnL9yZ010KJIJRVvIhlONLLuDKLnPka6+xMJi0rKrbVbdnLzWzMYPXs1nZrV5amLj6R7qwbJDktEkijeRHI+0ci83yUPMzvd3f+7j32kgvly4Vp++9rXrN+6i9+dcgiXHdteMxeKSNyDNj4AfG5mXWPK/pKAeKQccneeGr+YC56cSK3qVXnzqqO55sROSiIiAsR/R7KYqGrrDTO73d1fp/ij/0oK+Xbjdm55Zyafzs3m5C7NuG/o4dStqedYReR78f5GcHefamYnAK+YWV+ica+kApu+fCO/fGEKm3fs5qaBXbjy+A7q1isi/yPeqq2VAO6+FjiFqOvvYYkKSpLL3XktYznnPvYVVasYb151NFf176gkIiKFincY+dNilvOB34WXVDAbt+3iuhHT+Wz+Go5s14hHLjiCJnVrJjssESnH9plIzOxed7/BzP5DIQ8guvtPEhaZlLkJi9Zx4+tfk715J7ed0Y0L+7WlWtV4b1pFpLIq6o7khfB+Z6IDkeQaMWkZt7wzk7SGtXjx8r70ad842SGJSIrYZyJx9ynhfRyAmVUnahvJcvfsxIcnibYzN487Rx71KqgAABRySURBVM3jic8Xc2ynJjx8QW/q19JgiyISv33WW5jZo2Z2aFhuAHwNPA9MM7OhZRCfJNDaLTsZ+vgEnvh8Mef3bcNTv0hXEhGR/VZU1dZx7v7LsHwJMN/dzzSzg4APgFcSGp0kzKxvN3HFcxms3bqLB887nNN7tEh2SCKSoopKJLtiln8MvA7g7qsKRgKW1DN+wVqGvZBBg9rVefOXR2usLBEpkaISyUYzOx3IAo4herq9YBj52gmOTUqZu/Pw2IXc9dE8Ojaty3OX9qFFQ32NIlIyRSWSK4H7gYOAG9x9VSg/GXg/kYFJ6dqxO49b3pnJG1NWcFqPNP5xVne1h4hIqdhnY7u7z3f3ge7ey92fjSkf5e6/LergZva0mWWb2cyYssZmNtrMFoT3RqHczOx+M8s0s2/MrHfMPheH7ReY2cUx5UeY2Yywz/2m+rZCbdq+m6FPTOCNKSu49sROPDj0cCURESk1iX7a7Flg4B5lw4FP3L0z8En4DHAq0Dm8hgGPQJR4gNuAvkTT/d5WkHzCNlfE7LfnuSq95eu38fNHv2Rm1iYePO9wbjzlEJRvRaQ0JTSRuPtnwPo9igcDz4Xl54AzY8qf98gEoKGZpRGN7TXa3de7+wZgNDAwrKvv7hPc3Ym6JZ+JfGfasg389OEvWblpB8/8oo96ZolIQiRj/Ivm7r4yLK8CmofllsDymO1WhLJ9la8opPx/mNkwM8sws4w1a9aU/ApSwIczV3HOY19Rq3oV3rrqaI7t3CTZIYlIBbXficTMSm1WxHAn8T9jeJU2d3/c3dPdPb1p06aJPl1SuTv3fjyfX744hUMOqse71xxD5+b1kh2WiFRgxbkjKfSv/v
2wOlRLEd4LhlrJAlrHbNcqlO2rvFUh5ZVWfr5zyzszuffjBZx1eEve+OXRHKiRe0UkwYqTSKaV8JzvAQU9ry4G3o0pvyj03uoHbApVYKOAAWbWKDSyDwBGhXWbzaxf6K11UcyxKh1350/vzuSlicsYdnwH7vx5T02FKyJlYr/nTHX3S+Pd1sxeAfoDTcxsBVHvq38Cr5nZZcBS4Jyw+UhgEJAJbCMakgV3X29mfwUmh+3+4u4FDfhXE/UMq000ZMsH+3s9FYG78/eRc3hp4jKuPKEDwwd2Uc8sESkzFjVTVB7p6emekZGR7DBKTX6+89f3Z/PMF0s4v28b/nbmYUoiIlLqzGyKu6cXtm6/70ik/MgLbSKvTFrGL45ux21ndFMSEZEyp0SSonLz8vnj2zN5NWM5V/XvyO/1oKGIJElRU+1eFOdxprv7N6UQj8Rhx+48fvPadEbOWMXV/Tvy+4Fdkh2SiFRiRd2RtI/zOEtKGIfEKWfHbi56ehLTlm3kpoFduKp/x2SHJCKVXFGJ5O/uvrtMIpEirduyk4ufmcSclTk8fH5vBnVPS3ZIIiJFPkeSZWZPmtnJGlk3uRav3cpZj3zJgtVbePzCI5RERKTcKCqRdCV6fuMWYLmZ3RceFpQylJm9hSGPf0XOjlxevqIvJ3dtXvROIiJlpKj5SNa5+2PufiLREO6LgHvMbKGZ/V+ZRFjJLV+/jUufnUxevvPKFf04om3jZIckIvIDcQ+R4u7fAk8RzQGSA1yeqKAksnbLTs597Cs2bNvFExelc8hBGnxRRMqfIhOJmdUys5+b2VtEw5ecRDQZlSa3SKBN23Zz2XMZrNu6ixcv68vhbRoVvZOISBIU9RzJy8CPgHHAS8B57r6jLAKrzDbv2M0FT01k7qrNPHReb3q2bpjskERE9qqo7r8fAle6e05ZBCPRw4bDns9g7qrNPHbhEZzURQ3rIlK+FZVIxgIFw7fvy0Z331w6IVVeu3LzueL5DCYuXs895/RSEhGRlFBUInmuiPUQzXD4LNGc6VJM7s5Nb37D5wvWcsfPunPm4SWdP0xEpGzsM5GEbr+SYO7OXR/N5+1pWfzmxwdz7pFtkh2SiEjcijNDopSyR8Yt5MExmZyT3opfndQp2eGIiOwXJZIkGzVrFf8eNY/TeqRxx896aCh4EUk5SiRJlJm9hV+/Op0erRpy59k9lUREJCUpkSTJhq27uOy5ydSsVoVHL+hN7RpVkx2SiEixaIbEJNiZm8eVL05h5cYdvHxFX9Ia1E52SCIixaZEkgT/GDmXSYvXc9+QXqS30yCMIpLaVLVVxsbNX8OzXy7hgn5tGNxLz4qISOpLWiIxsyVmNsPMpptZRihrbGajzWxBeG8Uys3M7jezTDP7xsx6xxzn4rD9AjO7OFnXE4/M7Byue2UahzSvx/BTuyY7HBGRUpHsO5IT3b2Xu6eHz8OBT9y9M/BJ+AxwKtA5vIYRDWWPmTUGbgP6Es2Xclscw7kkxcZtu7jsuQyqV63C4xcdQd2aqlUUkYoh2YlkT4P5fliW54AzY8qf98gEoKGZpQGnAKPdfb27bwBGAwPLOuii5Oc7v3nta7I2bOfh83vT9sA6yQ5JRKTUJDOROPCRmU0xs2GhrLm7rwzLq4CCUQtbAstj9l0RyvZW/gNmNszMMswsY82aNaV5DXF59sslfDo3mz+e1pU+7dW4LiIVSzLrV4519ywzawaMNrO5sSvd3c3MS+NE7v448DhAenp6qRwzXovWbOGfH87l5C7N+MXR7cry1CIiZSJpdyTunhXes4G3ido4VocqK8J7dtg8C2gds3urULa38nIhP9/507szqVG1Cv84q7ueXBeRCikpicTM6phZvYJlYAAwE3gPKOh5dTHwblh+D7go9N7qB2wKVWCjgAFmVjBnyoBQVi68MnkZX2Su4+ZBXWhWv1aywxERSYhkVW01B94Of6FXA1529w/NbDLwmpldBiwFzgnbjwQGEc0Zvw
24BMDd15vZX4HJYbu/uPv6sruMvVu6biv/HDmXvu0bc14fDQsvIhVXUhKJuy8CehZSvg44uZByB67Zy7GeBp4u7RhLIj/fuX7EdDC48+cajFFEKjY9zJAAL05cyvTlG7n33F60bnxAssMREUmo8vYcScpbv3UXd46ax9EdD2RwrxbJDkdEJOGUSErZP0bOYfvuPG7/yaGq0hKRSkGJpBSNmZfN61NWcOmx7Tm4eb1khyMiUiaUSErJ9l153PL2TDo3q8uvf3RwssMRESkzamwvJY99tpCsjdsZMawftaprtkMRqTx0R1IKlq/fxqPjFjKo+0H063BgssMRESlTSiSl4O8j5wBwy2ndkhyJiEjZUyIpoclL1vPBzFVc3b8TLRpq7nURqXyUSEro7o/m06RuDa44rkOyQxERSQolkhL4fMEavlq0jqv6d6J2DTWwi0jlpERSTO7OXR/Np2XD2pzfV4MyikjlpURSTF9krmP68o1cc2IndfcVkUpNiaSYnvliMU3q1uRnR/zPzL4iIpWKEkkxLFidw6fzsjmvT2tqVtPdiIhUbkokxfDSxGVUq2L84pj2yQ5FRCTplEj207Zdubw5ZQWndU+jcZ0ayQ5HRCTplEj203vTvyVnZy4X9Gub7FBERMoFJZL99Na0LDo1q8sRbRslOxQRkXJBiWQ/rNq0g8lL1nNa9zRNWiUiEiiR7Id3p2fhDmf1VpdfEZECFSKRmNlAM5tnZplmNjxR5/lkTjbd0urT9sA6iTqFiEjKSflEYmZVgYeAU4FuwFAzK/Xx3LftymXqsg2ccEjT0j60iEhKS/lEAvQBMt19kbvvAkYAg0v7JNOWbSQ33+nbvnFpH1pEJKVVhETSElge83lFKPuOmQ0zswwzy1izZk2xTlKjWhV+1LUZPVs1LH6kIiIVUKWYs93dHwceB0hPT/fiHOPIdo05sp3uRkRE9lQR7kiygNYxn1uFMhERKQMVIZFMBjqbWXszqwEMAd5LckwiIpVGyldtuXuumV0LjAKqAk+7+6wkhyUiUmmkfCIBcPeRwMhkxyEiUhlVhKotERFJIiUSEREpESUSEREpESUSEREpEXMv1vN5KcvM1gBLi7l7E2BtKYaTTBXlWnQd5Yuuo/wprWtp6+6FDjZY6RJJSZhZhrunJzuO0lBRrkXXUb7oOsqfsrgWVW2JiEiJKJGIiEiJKJHsn8eTHUApqijXousoX3Qd5U/Cr0VtJCIiUiK6IxERkRJRIhERkRJRIomTmQ00s3lmlmlmw5MdTwEzW2JmM8xsupllhLLGZjbazBaE90ah3Mzs/nAN35hZ75jjXBy2X2BmF8eUHxGOnxn2tVKK+2kzyzazmTFlCY97b+co5eu43cyywncy3cwGxay7OcQ0z8xOiSkv9OcrTI8wMZS/GqZKwMxqhs+ZYX27El5HazMbY2azzWyWmV0fylPqO9nHdaTid1LLzCaZ2dfhWv5c3POX1jXulbvrVcSLaHj6hUAHoAbwNdAt2XGF2JYATfYo+xcwPCwPB+4Iy4OADwAD+gETQ3ljYFF4bxSWG4V1k8K2FvY9tZTiPh7oDcwsy7j3do5Svo7bgRsL2bZb+NmpCbQPP1NV9/XzBbwGDAnLjwJXheWrgUfD8hDg1RJeRxrQOyzXA+aHeFPqO9nHdaTid2JA3bBcHZgY/v326/yleY17jbU0filU9BdwFDAq5vPNwM3JjivEsoT/TSTzgLSwnAbMC8uPAUP33A4YCjwWU/5YKEsD5saU/2C7Uoi9HT/8BZzwuPd2jlK+jtsp/JfWD35uiObQOWpvP1/hF8laoNqeP4cF+4blamE7K8Xv5l3gx6n6nRRyHSn9nQAHAFOBvvt7/tK8xr29VLUVn5bA8pjPK0JZeeDAR2Y2xcyGhbLm7r4yLK8CmoflvV3HvspXFFKeKGUR997OUdquDVU+T8dU1ezvdRwIbHT33D3Kf3CssH5T2L7EQpXI4UR/Aafsd7LHdUAKfi
dmVtXMpgPZwGiiO4j9PX9pXmOhlEhS37Hu3hs4FbjGzI6PXenRnxQp18e7LOJO4DkeAToCvYCVwF0JOEdCmFld4E3gBnffHLsulb6TQq4jJb8Td89z915AK6AP0CXJIRVKiSQ+WUDrmM+tQlnSuXtWeM8G3ib6YVttZmkA4T07bL6369hXeatCyhOlLOLe2zlKjbuvDr8A8oEniL6T4lzHOqChmVXbo/wHxwrrG4Tti83MqhP98n3J3d8KxSn3nRR2Han6nRRw943AGKJqpv09f2leY6GUSOIzGegcejLUIGrIei/JMWFmdcysXsEyMACYSRRbQW+Zi4nqiQnlF4UeN/2ATaFKYRQwwMwahVv+AUR1oiuBzWbWL/SwuSjmWIlQFnHv7RylpuCXYvBTou+k4NxDQu+a9kBnogboQn++wl/nY4CzC4k39jrOBj4N2xc3ZgOeAua4+90xq1LqO9nbdaTod9LUzBqG5dpEbT1zinH+0rzGwpVWo1ZFfxH1UplPVEf5x2THE2LqQNTT4mtgVkFcRHWcnwALgI+BxqHcgIfCNcwA0mOOdSmQGV6XxJSnE/2nWwg8SOk1Hr5CVMWwm6gO9rKyiHtv5yjl63ghxPlN+E+cFrP9H0NM84jpAbe3n6/wHU8K1/c6UDOU1wqfM8P6DiW8jmOJqpS+AaaH16BU+072cR2p+J30AKaFmGcCtxb3/KV1jXt7aYgUEREpEVVtiYhIiSiRiIhIiSiRiIhIiSiRiIhIiSiRiIhIiSiRSMoyMzezu2I+32hmt+/nMZ61aFTYmuFzEzNbUkrx9Tez/5bGsYo4T9MwUus0Mztuj3U3mNkBcRxjS+IilIpOiURS2U7gLDNrUsLj5BE9+1CumFnVODc9GZjh7oe7++d7rLuBaMA/kYRRIpFUlks0H/WvS3ice4FfxwwJAfzvHYWZPWhmvwjLS8zsHxbmgTGz3mY2yswWmtkvYw5T38zet2jOh0fNrErYf4CZfWVmU83s9TA2VMFx7zCzqcDP94innZl9atHAg5+YWRsz60U0DPvgEEvtmO2vA1oAY8xsTCgbatGcIDPN7I49/yHCHdlXZnZa+Pw7M5sczvnnmDjmmNkTFs2T8VHBec3sOovmAvnGzEYU7+uQVKNEIqnuIeB8M2tQgmMsA8YDF+7vfh4NqPc58CzRkBL9gD/HbNMH+BXRnBAd+f4O6hbgRx4NuJkB/CZmn3Xu3tvd9/xF/ADwnLv3AF4C7nf36cCtRHNP9HL37QUbu/v9wLfAie5+opm1AO4ATiIavPBIMzuzYHszaw68T/QE9ftmNoBoOI0+Yfsj7PtBQTsDD7n7ocBG4GehfDhweIgxNqFKBVat6E1Eyi9332xmzwPXAduL2n4f/kE0ntD7+7FPwXhrM4gmIMoBcsxsZ8EYScAkd18EYGavEA3hsYMosXwRDQ1FDeCrmOO+upfzHQWcFZZfILoT2R9HAmPdfU2I5yWiibneIZo46RPgGncfF7YfEF7Twue6RAlkGbA4JDGAKURzskA0nMdLZvZOOK5UAkokUhHcSzTpzzOFrTSzUURzXGS4++WFbePuCyya9+GcmOJcfnjXXmuP3XaG9/yY5YLPBf+39hyDyInGqRrt7kMLvRrYupfyRMolSginAAWJxIB/uPtjsRtaNM9H7PXmAQVVaqcRJaczgD+aWXf/fl4LqaBUtSUpz93XE00Netle1p8Sqn0KTSIx/g+4MebzUqBbGDW1IVGj9v7qE0ZXrQKcS1SFNgE4xsw6wXejOB8cx7G+JBqhFeB8oiq1ouQQTTkL0SB8J4R2kKpEsxQWJA0n6nDQxcxuCmWjgEtj2m9amlmzvZ0oXGNrdx8D3EQ0jHndOGKUFKc7Eqko7gKuLckB3H1WaOTuHT4vN7PXiEZeXcz3VTz7YzLRSLediIbmftvd80Oj/SsF3Y6J2kzmF3GsXwHPmNnvgDXAJXGc/3HgQzP7NrSTDA9xGPC+u383PLi755
nZUOA9M8tx94fNrCvwVaiC2wJcQHQHUpiqwIuhvcqI2nA2xhGjpDiN/isiIiWiqi0RESkRJRIRESkRJRIRESkRJRIRESkRJRIRESkRJRIRESkRJRIRESmR/wdnK1o3o/TkYAAAAABJRU5ErkJggg==\n",
"text/plain": [
"<Figure size 432x288 with 1 Axes>"
]
},
"metadata": {
"needs_background": "light"
},
"output_type": "display_data"
}
],
"source": [
"# Vocabulary size against the number of tokens read so far.\n",
"plt.plot(N, V)\n",
"plt.ylabel('|V| - Size of vocabulary')\n",
"plt.xlabel('N - Number of tokens')\n",
"plt.title(\"Heaps' Law\")"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"1000 610\n",
"10000 3597\n",
"20000 5920\n",
"30000 7753\n",
"50000 10806\n",
"100000 16675\n"
]
}
],
"source": [
"# Vocabulary size after the first n tokens, at a few checkpoints.\n",
"for checkpoint in (1000, 10000, 20000, 30000, 50000, 100000):\n",
"    print(checkpoint, V[checkpoint - 1])"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {},
"outputs": [],
"source": [
"from math import log"
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"1.3188109468386284\n",
"0.6824809994294719\n",
"1.3976147883124628\n",
"0.655343979207257\n",
"1.5589214096765935\n",
"0.6405478197637083\n"
]
}
],
"source": [
"# Estimate the Heaps' law exponent beta from consecutive checkpoints.\n",
"# With |V| = k * N**beta, V2 / V1 = (N2 / N1)**beta, so beta is the log of the\n",
"# vocabulary growth ratio taken to base N2 / N1.\n",
"# The counts are read from V instead of being pasted in, so they always match\n",
"# the current corpus.\n",
"for n_small, n_large in [(20000, 30000), (30000, 50000), (50000, 100000)]:\n",
"    growth = V[n_large - 1] / V[n_small - 1]\n",
"    print(growth)\n",
"    print(log(growth, n_large / n_small))"
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"10.340940789558791"
]
},
"execution_count": 23,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# k = |V| / N**beta, evaluated at the N = 50000 checkpoint with beta = 0.65.\n",
"# The vocabulary size is read from V rather than typed in, so it tracks the data.\n",
"V[50000 - 1] / 50000 ** 0.65"
]
},
{
"cell_type": "code",
"execution_count": 28,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Text(0.5, 1.0, \"Heaps' Law\")"
]
},
"execution_count": 28,
"metadata": {},
"output_type": "execute_result"
},
{
"data": {
"image/png": "\n",
"text/plain": [
"<Figure size 432x288 with 1 Axes>"
]
},
"metadata": {
"needs_background": "light"
},
"output_type": "display_data"
}
],
"source": [
"# Measured vocabulary growth next to the Heaps' law fit |V| = k * N**beta.\n",
"beta = 0.64\n",
"# Anchor k at the N = 50000 checkpoint using the same beta that is plotted, so\n",
"# the fitted curve passes through the measured value there.\n",
"k = V[50000 - 1] / 50000 ** beta\n",
"\n",
"plt.plot(N, V, label='observed')\n",
"plt.plot(N, k * (N**beta), label='k * N**beta')\n",
"plt.xlabel('N - Number of tokens')\n",
"plt.ylabel('|V| - Size of vocabulary')\n",
"plt.legend()\n",
"plt.title('Heaps\\' Law')"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.8"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment