Created
June 1, 2015 14:47
IPython notebook for an ML exercise.
{ | |
"cells": [ | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"<h1 align=\"center\">Similarity of Jobs</h1>\n" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"\n", | |
"\n", | |
"<h3> mongoDB dump was given and similarity of jobs based on jobId was to be found</h3>\n" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"* step 1 involved cleaning JSON replaced true with \"True\" false with \"False\"\n", | |
"\n", | |
"* converted Removed ISODATE time as it is not proper JSON format " | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"`cat dataset.json | sed 's/ISODate(//g' | sed 's/)//g' > data.json`" | |
] | |
}, | |
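{
"cell_type": "markdown",
"metadata": {},
"source": [
"A minimal Python sketch of the same `ISODate` cleanup, for anyone without `sed`; the file name `dataset.json` and the exact `ISODate(...)` pattern are assumptions rather than part of the original steps."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# Hedged sketch: drop the ISODate(...) wrapper but keep the quoted timestamp\n",
"# inside it. Assumes the whole dump fits in memory.\n",
"import re\n",
"\n",
"with open('dataset.json') as f:\n",
"    raw = f.read()\n",
"\n",
"# ISODate(\"2015-04-20T10:00:00Z\") -> \"2015-04-20T10:00:00Z\"\n",
"cleaned = re.sub(r'ISODate\\((\".*?\")\\)', r'\\1', raw)\n",
"\n",
"with open('data.json', 'w') as f:\n",
"    f.write(cleaned)"
]
},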
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"* imported JSON file as dataframe." | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 1, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"# pandas library for handling data\n", | |
"import pandas as pd\n", | |
"\n", | |
"# reading data into a dataframe\n", | |
"df = pd.read_json(\"data.json\")\n", | |
"# copied to csv for exploratory processing.\n", | |
"df.to_csv(\"data1.csv\",encoding='utf-8')" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"### Removed attributes which give no information or having all the values same." | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"* the new file is processed_data.csv" | |
] | |
}, | |
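{
"cell_type": "markdown",
"metadata": {},
"source": [
"The pruning itself was done by hand, but the candidates can be listed programmatically. A hedged sketch, assuming the intermediate `data1.csv` written above: columns with a single distinct value carry no information, and columns with one distinct value per row (raw IDs) usually do not help either, though `cmsJobId` is deliberately kept as the job identifier."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# Hedged sketch: count distinct values per column to flag candidates that are\n",
"# constant (1 distinct value) or unique per row (as many distinct values as rows).\n",
"# The actual selection for processed_data.csv was made by inspection.\n",
"import pandas as pd\n",
"\n",
"raw = pd.read_csv('data1.csv', encoding='utf-8')\n",
"print(len(raw))                       # number of jobs\n",
"print(raw.apply(pd.Series.nunique))   # distinct values per column"
]
},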
{ | |
"cell_type": "code", | |
"execution_count": 2, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"Index([u'Unnamed: 0', u'_id', u'badge', u'category', u'city', u'cmsJobId', u'country', u'dateCreated', u'department', u'experience', u'expiryDate', u'industry', u'postedDate', u'posterEmail', u'state', u'title', u'type', u'workLocation', u'description'], dtype='object')" | |
] | |
}, | |
"execution_count": 2, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"df = pd.read_csv(\"processed_data.csv\")\n", | |
"df.columns" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"### function to convert description to words" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 3, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"import re\n", | |
"import nltk\n", | |
"\n", | |
"import pandas as pd\n", | |
"import numpy as np\n", | |
"\n", | |
"from bs4 import BeautifulSoup\n", | |
"from nltk.corpus import stopwords\n", | |
"def description_to_wordlist( row, remove_stopwords=False ):\n", | |
" # Function to convert a document to a sequence of words,\n", | |
" # optionally removing stop words. Returns a list of words.\n", | |
" #\n", | |
" # 1. Remove HTML\n", | |
" review = row['description']\n", | |
" review_text = BeautifulSoup(review).get_text()\n", | |
" #\n", | |
" # 2. Remove non-letters\n", | |
" review_text = re.sub(\"[^a-zA-Z]\",\" \", review_text)\n", | |
" #\n", | |
" # 3. Convert words to lower case and split them\n", | |
" words = review_text.lower().split()\n", | |
" #\n", | |
" # 4. Optionally remove stop words (false by default)\n", | |
" if remove_stopwords:\n", | |
" stops = set(stopwords.words(\"english\"))\n", | |
" words = [w for w in words if not w in stops]\n", | |
" #\n", | |
" # 5. Return a list of words\n", | |
" words[0] = words[0].replace(\"description\",\"\")\n", | |
" return words" | |
] | |
}, | |
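{
"cell_type": "markdown",
"metadata": {},
"source": [
"A quick, optional spot check of the cleaner on a single row; this is only illustrative, and the full `apply` over the dataframe follows in the next cells."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# Hedged spot check: clean the first job's description and look at the first\n",
"# twenty tokens it produces.\n",
"print(description_to_wordlist(df.iloc[0])[:20])"
]
},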
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"### add a new coloumn desc words which is a list of words obtained from description" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 4, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"df['desc_words'] = df.apply(description_to_wordlist,axis=1)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 5, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"#deleting columns with unique values = 1395\n", | |
"\n", | |
"df = df.drop(['Unnamed: 0','_id','country','experience'],1)" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"### forming TF-IDF matrix for description" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 6, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"from sklearn.feature_extraction.text import TfidfVectorizer\n", | |
"\n", | |
"ids = list(df['cmsJobId'])\n", | |
"desc_word_list = list(df['desc_words'])\n", | |
"sentences_desc = []\n", | |
"for each in desc_word_list:\n", | |
" sentences_desc.append(\" \".join(each))\n", | |
" \n", | |
"tfidf = TfidfVectorizer(sentences_desc)\n", | |
"tfs = tfidf.fit_transform(sentences_desc)\n", | |
"data_tfidf = pd.DataFrame(tfs.toarray(),index = ids)" | |
] | |
}, | |
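{
"cell_type": "markdown",
"metadata": {},
"source": [
"An optional peek at the highest-weighted TF-IDF terms for the first job, as a sanity check on the vectoriser; `get_feature_names()` is the API of the scikit-learn versions this notebook was written against (newer releases rename it)."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# Hedged sanity check: print the ten terms with the largest TF-IDF weight for\n",
"# the first job description.\n",
"terms = tfidf.get_feature_names()\n",
"first_row = tfs[0].toarray().ravel()\n",
"for i in np.argsort(first_row)[::-1][:10]:\n",
"    print('%s %.3f' % (terms[i], first_row[i]))"
]
},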
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"### added index as cmsJobId. concatnated dataframe and tf-if matrix\n", | |
"* droped date values with thought that similarity of job does not depend on date\n", | |
"* not much infor available as dates are in 4/5 th months" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 7, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"df = df.set_index('cmsJobId')\n", | |
"\n", | |
"final_df = pd.concat([df,data_tfidf], axis=1)\n", | |
"final_df = final_df.drop(['dateCreated','expiryDate','postedDate','description','desc_words'],1)\n", | |
"\n" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 8, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"badge object\n", | |
"category object\n", | |
"city object\n", | |
"department int64\n", | |
"industry int64\n", | |
"posterEmail int64\n", | |
"state object\n", | |
"title object\n", | |
"type int64\n", | |
"workLocation object\n", | |
"0 float64\n", | |
"1 float64\n", | |
"2 float64\n", | |
"3 float64\n", | |
"4 float64\n", | |
"...\n", | |
"3526 float64\n", | |
"3527 float64\n", | |
"3528 float64\n", | |
"3529 float64\n", | |
"3530 float64\n", | |
"3531 float64\n", | |
"3532 float64\n", | |
"3533 float64\n", | |
"3534 float64\n", | |
"3535 float64\n", | |
"3536 float64\n", | |
"3537 float64\n", | |
"3538 float64\n", | |
"3539 float64\n", | |
"3540 float64\n", | |
"Length: 3551, dtype: object" | |
] | |
}, | |
"execution_count": 8, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"# checking types of COLUMNS\n", | |
"final_df.dtypes" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"### converting int64 dtype to string as department and all have to bve strings" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 9, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"final_df['department'] = final_df['department'].apply(str)\n", | |
"final_df['industry'] = final_df['industry'].apply(str)\n", | |
"final_df['posterEmail'] = final_df['posterEmail'].apply(str)\n", | |
"final_df['type'] = final_df['type'].apply(str)" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"### converting categorical columns to numrical ones." | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 10, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"categorical_columns = final_df.columns[:10]\n", | |
"for each in categorical_columns:\n", | |
" final_df = pd.concat([final_df,pd.get_dummies(final_df[each])],axis=1)\n", | |
" final_df = final_df.drop(each,1)\n", | |
" " | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"### construct a matrix of cosine similarity\n", | |
"* Would be a `1395 * 1395` metrix with each (i,j) representing similarity between i,j" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 11, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"from sklearn.metrics import pairwise_distances\n", | |
"from scipy.spatial.distance import cosine\n", | |
"X = 1-pairwise_distances(final_df, metric=\"cosine\")" | |
] | |
}, | |
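{
"cell_type": "markdown",
"metadata": {},
"source": [
"An optional sanity check on `X`, not part of the original analysis: every job should be maximally similar to itself, and the matrix should be symmetric."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# Hedged sanity check: the diagonal of a cosine-similarity matrix should be\n",
"# (numerically) 1 and the matrix should equal its transpose.\n",
"print(X.shape)\n",
"print(np.allclose(np.diag(X), 1.0))\n",
"print(np.allclose(X, X.T))"
]
},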
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"### Top N elements based on similarity are" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 13, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"Enter the CMS ID 31799918\n", | |
"enter N3\n", | |
"\n", | |
" the top 3+1 elements including itself are\n", | |
"\n", | |
"31799918\n", | |
"30967095\n", | |
"31932945\n", | |
"24334651\n" | |
] | |
} | |
], | |
"source": [ | |
"def f(a,N):\n", | |
" return np.argsort(a)[::-1][:N]\n", | |
"cmsJobIds = list(final_df.index)\n", | |
"try:\n", | |
" given_id_index = cmsJobIds.index(input(\"Enter the CMS ID \"))\n", | |
" N = input(\"enter N\")\n", | |
"\n", | |
" cms_sim = np.copy(X[given_id_index])\n", | |
" print \"\\n the top \"+str(N)+\"+1 elements including itself are\\n\"\n", | |
" for each in f(cms_sim,N+1):\n", | |
" print cmsJobIds[each]\n", | |
"except:\n", | |
" print \"pls try again\"\n", | |
" \n" | |
] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 2", | |
"language": "python", | |
"name": "python2" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 2 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython2", | |
"version": "2.7.9" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 0 | |
} |