Last active
June 6, 2019 16:16
-
-
Save rounakdatta/c2571f078a175a5ad94b3d5bbe8723b8 to your computer and use it in GitHub Desktop.
Hey there! I am doing Machine Learning.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"def tokenizeSentence(mySentence, bow, allWords):\n", | |
"    \"\"\"Count the tokens of mySentence into bow and prepend them to allWords.\n", | |
"\n", | |
"    bow maps token -> running count; allWords is the running list of every\n", | |
"    token seen so far. Returns the updated (bow, allWords) pair.\n", | |
"    \"\"\"\n", | |
"    # drop basic punctuation, then split on single spaces\n", | |
"    cleaned_mySentence = mySentence.replace(\".\", \"\").replace(\",\", \"\").replace(\"!\", \"\").replace(\"?\", \"\")\n", | |
"    payload = cleaned_mySentence.split(\" \")\n", | |
"    # NOTE(review): allWords receives the unstripped tokens while the bow\n", | |
"    # keys are stripped below -- confirm this asymmetry is intended\n", | |
"    allWords = payload + allWords\n", | |
"    \n", | |
"    for token in payload:\n", | |
"        token = token.strip()\n", | |
"        # dict.get replaces the bare try/except, which would have\n", | |
"        # silently swallowed unrelated errors too\n", | |
"        bow[token] = bow.get(token, 0) + 1\n", | |
"    \n", | |
"    return bow, allWords" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# calculation of term frequency\n", | |
"def calculateTF(bow, allWords):\n", | |
"    \"\"\"Term frequency: each bag-of-words count divided by len(allWords).\"\"\"\n", | |
"    vocabSize = len(allWords)\n", | |
"    return {token: count / float(vocabSize) for token, count in bow.items()}" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# calculation of inverse document frequency\n", | |
"def computeIDF(docList):\n", | |
"    \"\"\"IDF per word over docList (a list of bag-of-words dicts).\n", | |
"\n", | |
"    Returns {word: log10(N / df)} where df is the number of documents\n", | |
"    whose count for the word is > 0.\n", | |
"    \"\"\"\n", | |
"    import math\n", | |
"    N = len(docList)\n", | |
"    \n", | |
"    # BUG FIX: the table was seeded from docList[0].keys() only, so any\n", | |
"    # word appearing solely in a later document raised KeyError below.\n", | |
"    # Counting with dict.get covers the union of all documents' words\n", | |
"    # and also guarantees every stored df is >= 1 (no division by zero).\n", | |
"    idfDict = {}\n", | |
"    for doc in docList:\n", | |
"        for word, val in doc.items():\n", | |
"            if val > 0:\n", | |
"                idfDict[word] = idfDict.get(word, 0) + 1\n", | |
"    \n", | |
"    for word, val in idfDict.items():\n", | |
"        idfDict[word] = math.log10(N / float(val))\n", | |
"    \n", | |
"    return idfDict" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"def computeTFIDF(tfBow, idfs):\n", | |
"    \"\"\"TF-IDF score per word: tf * idf.\n", | |
"\n", | |
"    A word absent from the idfs table scores 0.0 instead of raising\n", | |
"    KeyError (the original crashed on any vocabulary mismatch).\n", | |
"    \"\"\"\n", | |
"    tfidf = {}\n", | |
"    for word, val in tfBow.items():\n", | |
"        tfidf[word] = val * idfs.get(word, 0.0)\n", | |
"    return tfidf" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"import os\n", | |
"positiveReviews = os.listdir(\"./dataset/reviews/positive\")\n", | |
"negativeReviews = os.listdir(\"./dataset/reviews/negative\")\n", | |
"\n", | |
"bowPositive = {}\n", | |
"bowNegative = {}\n", | |
"\n", | |
"# we'll collect all the words here\n", | |
"allPositiveWords = []\n", | |
"allNegativeWords = []\n", | |
"\n", | |
"for pr in positiveReviews:\n", | |
"    # with-block closes each file handle (the original leaked them)\n", | |
"    with open(\"./dataset/reviews/positive/{}\".format(pr), \"r\") as f:\n", | |
"        for line in f:\n", | |
"            bowPositive, allPositiveWords = tokenizeSentence(line, bowPositive, allPositiveWords)\n", | |
"    \n", | |
"for nr in negativeReviews:\n", | |
"    with open(\"./dataset/reviews/negative/{}\".format(nr), \"r\") as f:\n", | |
"        for line in f:\n", | |
"            bowNegative, allNegativeWords = tokenizeSentence(line, bowNegative, allNegativeWords)\n", | |
"\n", | |
"# deduplicate: set of all the words (vocabulary) per corpus\n", | |
"allPositiveWords = list(set(allPositiveWords))\n", | |
"allNegativeWords = list(set(allNegativeWords))\n", | |
"\n", | |
"\n", | |
"# NOTE(review): foo/bar would read better as tfPositive/tfNegative,\n", | |
"# but the next cell depends on these names\n", | |
"foo = calculateTF(bowPositive, allPositiveWords)\n", | |
"bar = calculateTF(bowNegative, allNegativeWords)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# one IDF table over both corpora so positive and negative\n", | |
"# TF-IDF scores are comparable\n", | |
"idfs = computeIDF([bowPositive, bowNegative])\n", | |
"\n", | |
"# foo/bar are the per-corpus term frequencies from the previous cell\n", | |
"tfidfBowA = computeTFIDF(foo, idfs)\n", | |
"tfidfBowB = computeTFIDF(bar, idfs)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# create the dataframe out of the given review data\n", | |
"import pandas as pd\n", | |
"import os\n", | |
"from sklearn.model_selection import train_test_split\n", | |
"\n", | |
"positiveReviews = os.listdir(\"./dataset/reviews/positive\")\n", | |
"negativeReviews = os.listdir(\"./dataset/reviews/negative\")\n", | |
"\n", | |
"# one [text, label] record per review line: +1 positive, -1 negative\n", | |
"allReviews = []\n", | |
"\n", | |
"for pr in positiveReviews:\n", | |
"    # with-block closes each file handle (the original leaked them)\n", | |
"    with open(\"./dataset/reviews/positive/{}\".format(pr), \"r\") as f:\n", | |
"        for line in f:\n", | |
"            allReviews.append([line, 1])\n", | |
"    \n", | |
"for nr in negativeReviews:\n", | |
"    with open(\"./dataset/reviews/negative/{}\".format(nr), \"r\") as f:\n", | |
"        for line in f:\n", | |
"            allReviews.append([line, -1])\n", | |
"    \n", | |
"\n", | |
"df = pd.DataFrame(allReviews, columns=[\"Review\", \"Litmus\"])\n", | |
"# BUG FIX: `y` was never defined (NameError on any run); split the\n", | |
"# feature column and label column taken from df itself. random_state\n", | |
"# makes the split reproducible across kernel restarts.\n", | |
"X_train, X_test, y_train, y_test = train_test_split(\n", | |
"    df[\"Review\"], df[\"Litmus\"], test_size=0.2, random_state=42)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 3", | |
"language": "python", | |
"name": "python3" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.7.3" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 2 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment