Skip to content

Instantly share code, notes, and snippets.

View Venkatstatistics's full-sized avatar

Venkat Venkatstatistics

  • Aryma Labs
  • Bangalore
View GitHub Profile
@Venkatstatistics
Venkatstatistics / Recommender engine
Last active July 13, 2020 09:27
Recommender Engine - Under the hood
# Content-based recommender setup: vectorize item descriptions with TF-IDF.
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
# linear_kernel is imported for the cosine-similarity step — not used in this excerpt.
from sklearn.metrics.pairwise import linear_kernel
ds = pd.read_csv("test1.csv") #you can plug in your own list of products or movies or books here as csv file#
# min_df=0 keeps every term regardless of document frequency;
# NOTE(review): newer scikit-learn versions may reject an integer min_df of 0 — confirm against the installed version.
tf = TfidfVectorizer(analyzer='word', ngram_range=(1, 3), min_df=0, stop_words='english')
#ngram explanation begins#
#ngram (1,3) can be explained as follows#
#ngram(1,3) encompasses uni gram, bi gram and tri gram
#consider the sentence "The ball fell"
@Venkatstatistics
Venkatstatistics / Output
Created December 22, 2017 09:18
Recommender Engine Output
Recommending 2 books similar to The Elements of Statistical Learning
----------------------------------------------------------
You may also like to read: An introduction to Statistical Learning (score:0.389869522721)
You may also like to read: Statistical Distributions (score:0.13171009673)
ID,Book Title
1,Probabilistic Graphical Models
2,Bayesian Data Analysis
3,Doing data science
4,Pattern Recognition and Machine Learning
5,The Elements of Statistical Learning
6,An introduction to Statistical Learning
7,Python Machine Learning
8,Natural Language Processing with Python
9,Statistical Distributions
FROM rocker/r-ver:devel
RUN apt-get update && apt-get install -y \
sudo \
gdebi-core \
pandoc \
pandoc-citeproc \
libcurl4-gnutls-dev \
libcairo2-dev \
libxt-dev \
#Resume Phrase Matcher code
#importing all required libraries
import PyPDF2
import os
from os import listdir
from os.path import isfile, join
from io import StringIO
import csv
import sys
import pandas as pd
import numpy as np
from operator import itemgetter
import redis
# By default Redis runs on port 6379; the URL below encodes host, port and db 0.
# NOTE(review): REDIS_URL is never referenced in the visible code — the client
# below is configured with explicit host/port/db instead; kept for reference.
REDIS_URL = "redis://localhost:6379/0"
r = redis.Redis(host='localhost', port=6379, db=0)  # shared client used by read_biglist()/process()/dump()
# We define a function to read the 200k words. The words are stored in a
# column called 'keyword' in the csv file 'big_Keywords'. The words are read
# one by one and pushed onto a Redis list under the key "big_keywords".
def read_biglist():
    """Load the 'keyword' column of big_Keywords.csv and push each word
    onto the Redis list 'big_keywords'.

    LPUSH prepends, so the Redis list ends up in reverse CSV order.
    """
    keyword_frame = pd.read_csv("big_Keywords.csv")
    for word in keyword_frame.keyword.tolist():
        r.lpush("big_keywords", word)
def process():
import en_vectors_web_lg
nlp = en_vectors_web_lg.load()
topicdf = pd.read_csv("small_Topics.csv", encoding='Latin-1')
topics = topicdf.Topic.tolist()
while True:
big_keyword = r.lpop('big_keywords').decode('utf-8')
if not big_keyword:
def dump():
    """Write every entry of the Redis list 'results' to results.csv.

    Each entry is decoded from UTF-8 bytes and written on its own line.
    Also echoes each raw entry to stdout (debug trace kept from original).
    """
    # Pin the file encoding: without it, open() uses the platform default
    # (e.g. cp1252 on Windows), which can raise UnicodeEncodeError on
    # non-ASCII keywords that decode fine from UTF-8.
    with open('results.csv', 'w', encoding='utf-8') as f:
        for key in r.lrange('results', 0, -1):
            print(key)  # raw bytes as read from Redis
            f.write(key.decode('utf-8'))
            f.write('\n')
if __name__== "__main__":
import time
start = time.time()
import csv
import sys
import pandas as pd
import numpy as np
from operator import itemgetter
import redis