Last active
June 6, 2019 16:16
-
-
Save rounakdatta/c2571f078a175a5ad94b3d5bbe8723b8 to your computer and use it in GitHub Desktop.
Hey there! I am doing Machine Learning.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"def tokenizeSentence(mySentence, bow, allWords):\n", | |
"    \"\"\"Count the tokens of mySentence into bow and prepend them to allWords.\n", | |
"\n", | |
"    bow maps token -> running count; allWords is the running list of every\n", | |
"    token seen so far. Returns the updated (bow, allWords) pair.\n", | |
"    \"\"\"\n", | |
"    # drop basic punctuation, then split on single spaces\n", | |
"    cleaned_mySentence = mySentence.replace(\".\", \"\").replace(\",\", \"\").replace(\"!\", \"\").replace(\"?\", \"\")\n", | |
"    payload = cleaned_mySentence.split(\" \")\n", | |
"    # NOTE(review): allWords receives the unstripped tokens while the bow\n", | |
"    # keys are stripped below -- confirm this asymmetry is intended\n", | |
"    allWords = payload + allWords\n", | |
"    \n", | |
"    for token in payload:\n", | |
"        token = token.strip()\n", | |
"        # dict.get replaces the bare try/except, which would have\n", | |
"        # silently swallowed unrelated errors too\n", | |
"        bow[token] = bow.get(token, 0) + 1\n", | |
"    \n", | |
"    return bow, allWords" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# calculation of term frequency\n", | |
"def calculateTF(bow, allWords):\n", | |
"    \"\"\"Term frequency: each bag-of-words count divided by len(allWords).\"\"\"\n", | |
"    vocabSize = len(allWords)\n", | |
"    return {token: count / float(vocabSize) for token, count in bow.items()}" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# calculation of inverse document frequency\n", | |
"def computeIDF(docList):\n", | |
"    \"\"\"IDF per word over docList (a list of bag-of-words dicts).\n", | |
"\n", | |
"    Returns {word: log10(N / df)} where df is the number of documents\n", | |
"    whose count for the word is > 0.\n", | |
"    \"\"\"\n", | |
"    import math\n", | |
"    N = len(docList)\n", | |
"    \n", | |
"    # BUG FIX: the table was seeded from docList[0].keys() only, so any\n", | |
"    # word appearing solely in a later document raised KeyError below.\n", | |
"    # Counting with dict.get covers the union of all documents' words\n", | |
"    # and also guarantees every stored df is >= 1 (no division by zero).\n", | |
"    idfDict = {}\n", | |
"    for doc in docList:\n", | |
"        for word, val in doc.items():\n", | |
"            if val > 0:\n", | |
"                idfDict[word] = idfDict.get(word, 0) + 1\n", | |
"    \n", | |
"    for word, val in idfDict.items():\n", | |
"        idfDict[word] = math.log10(N / float(val))\n", | |
"    \n", | |
"    return idfDict" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"def computeTFIDF(tfBow, idfs):\n", | |
"    \"\"\"TF-IDF score per word: tf * idf.\n", | |
"\n", | |
"    A word absent from the idfs table scores 0.0 instead of raising\n", | |
"    KeyError (the original crashed on any vocabulary mismatch).\n", | |
"    \"\"\"\n", | |
"    tfidf = {}\n", | |
"    for word, val in tfBow.items():\n", | |
"        tfidf[word] = val * idfs.get(word, 0.0)\n", | |
"    return tfidf" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"import os\n", | |
"positiveReviews = os.listdir(\"./dataset/reviews/positive\")\n", | |
"negativeReviews = os.listdir(\"./dataset/reviews/negative\")\n", | |
"\n", | |
"bowPositive = {}\n", | |
"bowNegative = {}\n", | |
"\n", | |
"# we'll collect all the words here\n", | |
"allPositiveWords = []\n", | |
"allNegativeWords = []\n", | |
"\n", | |
"for pr in positiveReviews:\n", | |
"    # with-block closes each file handle (the original leaked them)\n", | |
"    with open(\"./dataset/reviews/positive/{}\".format(pr), \"r\") as f:\n", | |
"        for line in f:\n", | |
"            bowPositive, allPositiveWords = tokenizeSentence(line, bowPositive, allPositiveWords)\n", | |
"    \n", | |
"for nr in negativeReviews:\n", | |
"    with open(\"./dataset/reviews/negative/{}\".format(nr), \"r\") as f:\n", | |
"        for line in f:\n", | |
"            bowNegative, allNegativeWords = tokenizeSentence(line, bowNegative, allNegativeWords)\n", | |
"\n", | |
"# deduplicate: set of all the words (vocabulary) per corpus\n", | |
"allPositiveWords = list(set(allPositiveWords))\n", | |
"allNegativeWords = list(set(allNegativeWords))\n", | |
"\n", | |
"\n", | |
"# NOTE(review): foo/bar would read better as tfPositive/tfNegative,\n", | |
"# but the next cell depends on these names\n", | |
"foo = calculateTF(bowPositive, allPositiveWords)\n", | |
"bar = calculateTF(bowNegative, allNegativeWords)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# one IDF table over both corpora so positive and negative\n", | |
"# TF-IDF scores are comparable\n", | |
"idfs = computeIDF([bowPositive, bowNegative])\n", | |
"\n", | |
"# foo/bar are the per-corpus term frequencies from the previous cell\n", | |
"tfidfBowA = computeTFIDF(foo, idfs)\n", | |
"tfidfBowB = computeTFIDF(bar, idfs)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# create the dataframe out of the given review data\n", | |
"import pandas as pd\n", | |
"import os\n", | |
"from sklearn.model_selection import train_test_split\n", | |
"\n", | |
"positiveReviews = os.listdir(\"./dataset/reviews/positive\")\n", | |
"negativeReviews = os.listdir(\"./dataset/reviews/negative\")\n", | |
"\n", | |
"# one [text, label] record per review line: +1 positive, -1 negative\n", | |
"allReviews = []\n", | |
"\n", | |
"for pr in positiveReviews:\n", | |
"    # with-block closes each file handle (the original leaked them)\n", | |
"    with open(\"./dataset/reviews/positive/{}\".format(pr), \"r\") as f:\n", | |
"        for line in f:\n", | |
"            allReviews.append([line, 1])\n", | |
"    \n", | |
"for nr in negativeReviews:\n", | |
"    with open(\"./dataset/reviews/negative/{}\".format(nr), \"r\") as f:\n", | |
"        for line in f:\n", | |
"            allReviews.append([line, -1])\n", | |
"    \n", | |
"\n", | |
"df = pd.DataFrame(allReviews, columns=[\"Review\", \"Litmus\"])\n", | |
"# BUG FIX: `y` was never defined (NameError on any run); split the\n", | |
"# feature column and label column taken from df itself. random_state\n", | |
"# makes the split reproducible across kernel restarts.\n", | |
"X_train, X_test, y_train, y_test = train_test_split(\n", | |
"    df[\"Review\"], df[\"Litmus\"], test_size=0.2, random_state=42)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 3", | |
"language": "python", | |
"name": "python3" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.7.3" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 2 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment