Created
June 1, 2015 14:47
IPython notebook for an ML exercise.
{ | |
"cells": [ | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"<h1 align=\"center\">Similarity of Jobs</h1>\n" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"\n", | |
"\n", | |
"<h3> mongoDB dump was given and similarity of jobs based on jobId was to be found</h3>\n" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"* step 1 involved cleaning JSON replaced true with \"True\" false with \"False\"\n", | |
"\n", | |
"* converted Removed ISODATE time as it is not proper JSON format " | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"`cat dataset.json | sed 's/ISODate(//g' | sed 's/)//g' > data.json`" | |
] | |
}, | |
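{
"cell_type": "markdown",
"metadata": {},
"source": [
"A minimal Python sketch of the same `ISODate` cleanup, for anyone without `sed`; the file name `dataset.json` and the exact `ISODate(...)` pattern are assumptions rather than part of the original steps."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# Hedged sketch: drop the ISODate(...) wrapper but keep the quoted timestamp\n",
"# inside it. Assumes the whole dump fits in memory.\n",
"import re\n",
"\n",
"with open('dataset.json') as f:\n",
"    raw = f.read()\n",
"\n",
"# ISODate(\"2015-04-20T10:00:00Z\") -> \"2015-04-20T10:00:00Z\"\n",
"cleaned = re.sub(r'ISODate\\((\".*?\")\\)', r'\\1', raw)\n",
"\n",
"with open('data.json', 'w') as f:\n",
"    f.write(cleaned)"
]
},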
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"* imported JSON file as dataframe." | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 1, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"# pandas library for handling data\n", | |
"import pandas as pd\n", | |
"\n", | |
"# reading data into a dataframe\n", | |
"df = pd.read_json(\"data.json\")\n", | |
"# copied to csv for exploratory processing.\n", | |
"df.to_csv(\"data1.csv\",encoding='utf-8')" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"### Removed attributes which give no information or having all the values same." | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"* the new file is processed_data.csv" | |
] | |
}, | |
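{
"cell_type": "markdown",
"metadata": {},
"source": [
"The pruning itself was done by hand, but the candidates can be listed programmatically. A hedged sketch, assuming the intermediate `data1.csv` written above: columns with a single distinct value carry no information, and columns with one distinct value per row (raw IDs) usually do not help either, though `cmsJobId` is deliberately kept as the job identifier."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# Hedged sketch: count distinct values per column to flag candidates that are\n",
"# constant (1 distinct value) or unique per row (as many distinct values as rows).\n",
"# The actual selection for processed_data.csv was made by inspection.\n",
"import pandas as pd\n",
"\n",
"raw = pd.read_csv('data1.csv', encoding='utf-8')\n",
"print(len(raw))                       # number of jobs\n",
"print(raw.apply(pd.Series.nunique))   # distinct values per column"
]
},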
{ | |
"cell_type": "code", | |
"execution_count": 2, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"Index([u'Unnamed: 0', u'_id', u'badge', u'category', u'city', u'cmsJobId', u'country', u'dateCreated', u'department', u'experience', u'expiryDate', u'industry', u'postedDate', u'posterEmail', u'state', u'title', u'type', u'workLocation', u'description'], dtype='object')" | |
] | |
}, | |
"execution_count": 2, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"df = pd.read_csv(\"processed_data.csv\")\n", | |
"df.columns" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"### function to convert description to words" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 3, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"import re\n", | |
"import nltk\n", | |
"\n", | |
"import pandas as pd\n", | |
"import numpy as np\n", | |
"\n", | |
"from bs4 import BeautifulSoup\n", | |
"from nltk.corpus import stopwords\n", | |
"def description_to_wordlist( row, remove_stopwords=False ):\n", | |
" # Function to convert a document to a sequence of words,\n", | |
" # optionally removing stop words. Returns a list of words.\n", | |
" #\n", | |
" # 1. Remove HTML\n", | |
" review = row['description']\n", | |
" review_text = BeautifulSoup(review).get_text()\n", | |
" #\n", | |
" # 2. Remove non-letters\n", | |
" review_text = re.sub(\"[^a-zA-Z]\",\" \", review_text)\n", | |
" #\n", | |
" # 3. Convert words to lower case and split them\n", | |
" words = review_text.lower().split()\n", | |
" #\n", | |
" # 4. Optionally remove stop words (false by default)\n", | |
" if remove_stopwords:\n", | |
" stops = set(stopwords.words(\"english\"))\n", | |
" words = [w for w in words if not w in stops]\n", | |
" #\n", | |
" # 5. Return a list of words\n", | |
" words[0] = words[0].replace(\"description\",\"\")\n", | |
" return words" | |
] | |
}, | |
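{
"cell_type": "markdown",
"metadata": {},
"source": [
"A quick, optional spot check of the cleaner on a single row; this is only illustrative, and the full `apply` over the dataframe follows in the next cells."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# Hedged spot check: clean the first job's description and look at the first\n",
"# twenty tokens it produces.\n",
"print(description_to_wordlist(df.iloc[0])[:20])"
]
},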
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"### add a new coloumn desc words which is a list of words obtained from description" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 4, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"df['desc_words'] = df.apply(description_to_wordlist,axis=1)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 5, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"#deleting columns with unique values = 1395\n", | |
"\n", | |
"df = df.drop(['Unnamed: 0','_id','country','experience'],1)" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"### forming TF-IDF matrix for description" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 6, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"from sklearn.feature_extraction.text import TfidfVectorizer\n", | |
"\n", | |
"ids = list(df['cmsJobId'])\n", | |
"desc_word_list = list(df['desc_words'])\n", | |
"sentences_desc = []\n", | |
"for each in desc_word_list:\n", | |
" sentences_desc.append(\" \".join(each))\n", | |
" \n", | |
"tfidf = TfidfVectorizer(sentences_desc)\n", | |
"tfs = tfidf.fit_transform(sentences_desc)\n", | |
"data_tfidf = pd.DataFrame(tfs.toarray(),index = ids)" | |
] | |
}, | |
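{
"cell_type": "markdown",
"metadata": {},
"source": [
"An optional peek at the highest-weighted TF-IDF terms for the first job, as a sanity check on the vectoriser; `get_feature_names()` is the API of the scikit-learn versions this notebook was written against (newer releases rename it)."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# Hedged sanity check: print the ten terms with the largest TF-IDF weight for\n",
"# the first job description.\n",
"terms = tfidf.get_feature_names()\n",
"first_row = tfs[0].toarray().ravel()\n",
"for i in np.argsort(first_row)[::-1][:10]:\n",
"    print('%s %.3f' % (terms[i], first_row[i]))"
]
},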
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"### added index as cmsJobId. concatnated dataframe and tf-if matrix\n", | |
"* droped date values with thought that similarity of job does not depend on date\n", | |
"* not much infor available as dates are in 4/5 th months" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 7, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"df = df.set_index('cmsJobId')\n", | |
"\n", | |
"final_df = pd.concat([df,data_tfidf], axis=1)\n", | |
"final_df = final_df.drop(['dateCreated','expiryDate','postedDate','description','desc_words'],1)\n", | |
"\n" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 8, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"badge object\n", | |
"category object\n", | |
"city object\n", | |
"department int64\n", | |
"industry int64\n", | |
"posterEmail int64\n", | |
"state object\n", | |
"title object\n", | |
"type int64\n", | |
"workLocation object\n", | |
"0 float64\n", | |
"1 float64\n", | |
"2 float64\n", | |
"3 float64\n", | |
"4 float64\n", | |
"...\n", | |
"3526 float64\n", | |
"3527 float64\n", | |
"3528 float64\n", | |
"3529 float64\n", | |
"3530 float64\n", | |
"3531 float64\n", | |
"3532 float64\n", | |
"3533 float64\n", | |
"3534 float64\n", | |
"3535 float64\n", | |
"3536 float64\n", | |
"3537 float64\n", | |
"3538 float64\n", | |
"3539 float64\n", | |
"3540 float64\n", | |
"Length: 3551, dtype: object" | |
] | |
}, | |
"execution_count": 8, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"# checking types of COLUMNS\n", | |
"final_df.dtypes" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"### converting int64 dtype to string as department and all have to bve strings" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 9, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"final_df['department'] = final_df['department'].apply(str)\n", | |
"final_df['industry'] = final_df['industry'].apply(str)\n", | |
"final_df['posterEmail'] = final_df['posterEmail'].apply(str)\n", | |
"final_df['type'] = final_df['type'].apply(str)" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"### converting categorical columns to numrical ones." | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 10, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"categorical_columns = final_df.columns[:10]\n", | |
"for each in categorical_columns:\n", | |
" final_df = pd.concat([final_df,pd.get_dummies(final_df[each])],axis=1)\n", | |
" final_df = final_df.drop(each,1)\n", | |
" " | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"### construct a matrix of cosine similarity\n", | |
"* Would be a `1395 * 1395` metrix with each (i,j) representing similarity between i,j" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 11, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"from sklearn.metrics import pairwise_distances\n", | |
"from scipy.spatial.distance import cosine\n", | |
"X = 1-pairwise_distances(final_df, metric=\"cosine\")" | |
] | |
}, | |
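{
"cell_type": "markdown",
"metadata": {},
"source": [
"An optional sanity check on `X`, not part of the original analysis: every job should be maximally similar to itself, and the matrix should be symmetric."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# Hedged sanity check: the diagonal of a cosine-similarity matrix should be\n",
"# (numerically) 1 and the matrix should equal its transpose.\n",
"print(X.shape)\n",
"print(np.allclose(np.diag(X), 1.0))\n",
"print(np.allclose(X, X.T))"
]
},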
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"### Top N elements based on similarity are" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 13, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"Enter the CMS ID 31799918\n", | |
"enter N3\n", | |
"\n", | |
" the top 3+1 elements including itself are\n", | |
"\n", | |
"31799918\n", | |
"30967095\n", | |
"31932945\n", | |
"24334651\n" | |
] | |
} | |
], | |
"source": [ | |
"def f(a,N):\n", | |
" return np.argsort(a)[::-1][:N]\n", | |
"cmsJobIds = list(final_df.index)\n", | |
"try:\n", | |
" given_id_index = cmsJobIds.index(input(\"Enter the CMS ID \"))\n", | |
" N = input(\"enter N\")\n", | |
"\n", | |
" cms_sim = np.copy(X[given_id_index])\n", | |
" print \"\\n the top \"+str(N)+\"+1 elements including itself are\\n\"\n", | |
" for each in f(cms_sim,N+1):\n", | |
" print cmsJobIds[each]\n", | |
"except:\n", | |
" print \"pls try again\"\n", | |
" \n" | |
] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 2", | |
"language": "python", | |
"name": "python2" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 2 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython2", | |
"version": "2.7.9" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 0 | |
} |