Last active
June 18, 2016 03:18
-
-
Save luthfihariz/c772c1965ecb14346532edd985d7e010 to your computer and use it in GitHub Desktop.
Using TF-IDF and cosine similarity to find item-to-item similarity based on each item's name and description.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import json
import time

import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

# Number of most-similar items stored for every item.
TOP_N = 20

# Wall-clock start of the run; not read again in the visible part of the script.
start = time.time()

ds = pd.read_csv('query_result.csv')

# Train the engine.
# Create a TF-IDF matrix of unigrams, bigrams, and trigrams for each product.
# Then compute the similarity between all products using scikit-learn's
# linear_kernel (which in this case is equivalent to cosine similarity,
# because TfidfVectorizer L2-normalizes each row by default).
# Iterate through each item's similar items and store the TOP_N most similar.
# Similarities and their scores are stored in a new CSV file as JSON lists of
# [score, id] pairs.

# Single-argument print(...) behaves the same on Python 2 and Python 3.
print(ds)

# Missing descriptions are treated as empty text; the vectorizer rejects NaN.
descriptions = ds['item_description'].fillna('')

# min_df=1 keeps every term, which is what the former min_df=0 meant; recent
# scikit-learn releases reject an integer min_df below 1.
tf = TfidfVectorizer(analyzer='word', ngram_range=(1, 3), min_df=1)
tfidf_matrix = tf.fit_transform(descriptions)
print(tfidf_matrix)

cosine_similarities = linear_kernel(tfidf_matrix, tfidf_matrix)

# Plain Python values (not NumPy scalars) so json.dumps can serialize the ids.
item_ids = ds['id'].tolist()

similiarities = []
ids = []
for position, item_id in enumerate(item_ids):
    scores = cosine_similarities[position]
    # Rank one candidate more than needed: the item itself is normally among
    # the best matches and is removed by index below.  Removing it by index
    # (rather than dropping the first entry) stays correct when a duplicate
    # or an empty description ties with the item's own score.
    ranked = scores.argsort()[::-1][:TOP_N + 1]
    similar_items = [
        (float(scores[i]), item_ids[i]) for i in ranked if i != position
    ][:TOP_N]
    similiarities.append(json.dumps(similar_items))
    ids.append(item_id)

# The 'similiar_items' column name is kept as-is: readers of the output file
# look the column up by this exact key.
d = {'id': ids, 'similiar_items': similiarities}
dataFrame = pd.DataFrame(d)
dataFrame.to_csv('trained_data.csv', index=False)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment