@rounakdatta
Last active June 6, 2019 16:16
Hey there! I am doing Machine Learning.
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def tokenizeSentence(mySentence, bow, allWords):\n",
" cleaned_mySentence = mySentence.replace(\".\", \"\").replace(\",\", \"\").replace(\"!\", \"\").replace(\"?\", \"\")\n",
" payload = cleaned_mySentence.split(\" \")\n",
" allWords = payload + allWords\n",
" \n",
" for token in payload:\n",
" token = token.lstrip().rstrip()\n",
" try:\n",
" bow[token] += 1\n",
" except:\n",
" bow[token] = 1\n",
" \n",
" return bow, allWords"
]
},
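{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Tiny usage example (added for illustration, not part of the original gist):\n",
"# tokenizeSentence takes a sentence plus the running bag of words and word list,\n",
"# and returns both with the new tokens folded in. Matching is case-sensitive.\n",
"exampleBow, exampleWords = tokenizeSentence(\"Great movie, great cast!\", {}, [])\n",
"print(exampleBow)    # {'Great': 1, 'movie': 1, 'great': 1, 'cast': 1}\n",
"print(exampleWords)  # ['Great', 'movie', 'great', 'cast']"
]
},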
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# calculation of term frequency\n",
"def calculateTF(bow, allWords):\n",
" tfDict = {}\n",
" counter = len(allWords)\n",
" for token, val in bow.items():\n",
" tfDict[token] = val/float(counter)\n",
" return tfDict"
]
},
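{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Tiny worked example (added for illustration, not part of the original gist):\n",
"# note that in this notebook the denominator is the number of distinct words,\n",
"# not the total token count, so a word repeated twice out of two distinct words\n",
"# gets tf = 2/2 = 1.0.\n",
"print(calculateTF({\"good\": 2, \"bad\": 1}, [\"good\", \"bad\"]))  # {'good': 1.0, 'bad': 0.5}"
]
},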
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# calculation of inverse document frequency\n",
"def computeIDF(docList):\n",
" import math\n",
" idfDict = {}\n",
" N = len(docList)\n",
" \n",
" idfDict = dict.fromkeys(docList[0].keys(), 0)\n",
" for doc in docList:\n",
" for word, val in doc.items():\n",
" if val > 0:\n",
" idfDict[word] += 1\n",
" \n",
" for word, val in idfDict.items():\n",
" idfDict[word] = math.log10(N / float(val))\n",
" \n",
" return idfDict"
]
},
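{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Tiny worked example (added for illustration, not part of the original gist):\n",
"# with two documents, a word present in both gets idf = log10(2/2) = 0, while a\n",
"# word unique to one document gets idf = log10(2/1), roughly 0.301.\n",
"print(computeIDF([{\"good\": 2, \"film\": 1}, {\"bad\": 1, \"film\": 3}]))"
]
},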
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def computeTFIDF(tfBow, idfs):\n",
" tfidf = {}\n",
" for word, val in tfBow.items():\n",
" tfidf[word] = val*idfs[word]\n",
" return tfidf"
]
},
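{
"cell_type": "markdown",
"metadata": {},
"source": [
"A note added for clarity (not part of the original gist): the three helpers above follow the usual TF-IDF recipe. For a term $t$ in a document $d$ drawn from a collection of $N$ documents,\n",
"\n",
"$$\\mathrm{tf}(t, d) = \\frac{\\text{count of } t \\text{ in } d}{\\text{number of words in } d}, \\qquad \\mathrm{idf}(t) = \\log_{10}\\frac{N}{\\text{number of documents containing } t}, \\qquad \\mathrm{tfidf}(t, d) = \\mathrm{tf}(t, d)\\,\\mathrm{idf}(t)$$\n",
"\n",
"with the caveat that `calculateTF` here divides by the number of *distinct* words rather than the total token count."
]
},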
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"positiveReviews = os.listdir(\"./dataset/reviews/positive\")\n",
"negativeReviews = os.listdir(\"./dataset/reviews/negative\")\n",
"\n",
"bowPositive = {}\n",
"bowNegative = {}\n",
"\n",
"# we'll collect all the words here\n",
"allPositiveWords = []\n",
"allNegativeWords = []\n",
"\n",
"for pr in positiveReviews:\n",
" f = open(\"./dataset/reviews/positive/{}\".format(pr), \"r\")\n",
" for line in f.readlines():\n",
" bowPositive, allPositiveWords = tokenizeSentence(line, bowPositive, allPositiveWords)\n",
" \n",
"for nr in negativeReviews:\n",
" f = open(\"./dataset/reviews/negative/{}\".format(nr), \"r\")\n",
" for line in f.readlines():\n",
" bowNegative, allNegativeWords = tokenizeSentence(line, bowNegative, allNegativeWords)\n",
"\n",
"# set of all the words\n",
"allPositiveWords = list(set(allPositiveWords))\n",
"allNegativeWords = list(set(allNegativeWords))\n",
"\n",
"\n",
"foo = calculateTF(bowPositive, allPositiveWords)\n",
"bar = calculateTF(bowNegative, allNegativeWords)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"idfs = computeIDF([bowPositive, bowNegative])\n",
"\n",
"tfidfBowA = computeTFIDF(foo, idfs)\n",
"tfidfBowB = computeTFIDF(bar, idfs)"
]
},
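{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Quick sanity check (added for illustration, not part of the original gist):\n",
"# list the terms with the highest tf-idf weight in each class. With only two\n",
"# documents, words shared by both classes get idf = log10(2/2) = 0, so only\n",
"# class-specific words carry non-zero weight here.\n",
"print(sorted(tfidfBowA.items(), key=lambda kv: kv[1], reverse=True)[:10])\n",
"print(sorted(tfidfBowB.items(), key=lambda kv: kv[1], reverse=True)[:10])"
]
},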
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# create the dataframe out of the given review data\n",
"import pandas as pd\n",
"import os\n",
"from sklearn.model_selection import train_test_split\n",
"\n",
"positiveReviews = os.listdir(\"./dataset/reviews/positive\")\n",
"negativeReviews = os.listdir(\"./dataset/reviews/negative\")\n",
"\n",
"allReviews = []\n",
"\n",
"for pr in positiveReviews:\n",
" f = open(\"./dataset/reviews/positive/{}\".format(pr), \"r\")\n",
" for line in f.readlines():\n",
" allReviews.append([line, 1])\n",
" \n",
"for nr in negativeReviews:\n",
" f = open(\"./dataset/reviews/negative/{}\".format(nr), \"r\")\n",
" for line in f.readlines():\n",
" allReviews.append([line, -1])\n",
" \n",
"\n",
"df = pd.DataFrame(allReviews, columns=[\"Review\", \"Litmus\"])\n",
"X_train, X_test, y_train, y_test = train_test_split(df, y, test_size=0.2)"
]
},
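{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Hedged sketch of a possible next step (not part of the original gist): vectorize\n",
"# the review text and fit a simple baseline classifier on the split created above.\n",
"# This uses scikit-learn's TfidfVectorizer instead of the hand-rolled functions\n",
"# defined earlier in the notebook.\n",
"from sklearn.feature_extraction.text import TfidfVectorizer\n",
"from sklearn.linear_model import LogisticRegression\n",
"\n",
"vectorizer = TfidfVectorizer()\n",
"X_train_vec = vectorizer.fit_transform(X_train)\n",
"X_test_vec = vectorizer.transform(X_test)\n",
"\n",
"clf = LogisticRegression(max_iter=1000)\n",
"clf.fit(X_train_vec, y_train)\n",
"print(\"test accuracy:\", clf.score(X_test_vec, y_test))"
]
},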
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.3"
}
},
"nbformat": 4,
"nbformat_minor": 2
}