Skip to content

Instantly share code, notes, and snippets.

View kshirsagarsiddharth's full-sized avatar
🎯
Focusing

kshirsagarsiddharth

🎯
Focusing
View GitHub Profile
# Keep only the users whose location matches one of the listed country names.
location_filtered = users.filter(F.col("location").rlike(r"india|usa|china"))

# Inner-join those users with combined_and_filtered on user_id, keep the four
# rating columns, and cast user_id to an integer.
filtered_with_location = (
    location_filtered
    .join(combined_and_filtered, on='user_id', how='inner')
    .select('user_id', 'isbn', 'book_rating', 'book_title')
    .withColumn('user_id', F.col('user_id').cast('int'))
)
# Register both DataFrames as temporary views so the SQL below can refer to
# them by name.
books.createOrReplaceTempView('books')
ratings.createOrReplaceTempView('ratings')

# One row per rating, with the title pulled from the matching book (same isbn).
combined_dataframe = spark.sql(
    """
SELECT user_id,r.isbn,book_rating,book_title
FROM ratings r INNER JOIN books b
ON r.isbn = b.isbn
"""
)
# we only keep books that 15 or more users have rated and books that have 50 or more ratings
# Count the users at each location and bring the (location, count) table into
# pandas, ordered by count.
temp_df = (
    users.groupBy('location')
    .count()
    .select('location', F.col('count').cast('int'))
    .toPandas()
    .sort_values(by='count')
)

# sns.distplot is deprecated and scheduled for removal from seaborn. The
# documented replacement is histplot with a density-normalised histogram and
# a KDE overlay (cut=3 keeps the old curve extent).
sns.histplot(temp_df['count'], kde=True, stat='density', kde_kws={'cut': 3})
plt.title('Location')
plt.show()
# Count the users at each age value and bring the (age, count) table into
# pandas, ordered by count.
temp_df = (
    users.groupBy('age')
    .count()
    .select('age', F.col('count').cast('int'))
    .toPandas()
    .sort_values(by='count')
)

# sns.distplot is deprecated and scheduled for removal from seaborn. The
# documented replacement is histplot with a density-normalised histogram and
# a KDE overlay (cut=3 keeps the old curve extent).
sns.histplot(temp_df['count'], kde=True, stat='density', kde_kws={'cut': 3})
plt.title('age')
plt.show()
## let's visualize some key parameters
import pyspark.sql.functions as F
import seaborn as sns
# Count how many ratings were given at each rating value, then move the small
# aggregated table into pandas for plotting.
temp_df = (
    ratings
    .groupBy('book_rating')
    .count()
    .select('book_rating', F.col('count').cast('int'))
    .toPandas()
    .sort_values(by='count')
)

# One bar per rating value; bar height is the number of ratings.
sns.barplot(data=temp_df, x='book_rating', y='count')
books = books.withColumnRenamed("ISBN","isbn") \
.withColumnRenamed("Book-Title","book_title") \
.withColumnRenamed('Book-Author','book_author') \
# Rename the users columns to lowercase snake_case names.
users = (
    users
    .withColumnRenamed('User-ID', 'user_id')
    .withColumnRenamed('Location', 'location')
    .withColumnRenamed('Age', 'age')
)
ratings = ratings.withColumnRenamed('User-ID','user_id')\
.withColumnRenamed('ISBN','isbn')\
@kshirsagarsiddharth
kshirsagarsiddharth / load_data.py
Created June 8, 2021 04:53
recommendation loading data
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pyspark.sql import SparkSession
# Create (or reuse) the Spark session, registered under the app name 'rec'.
spark = (
    SparkSession.builder
    .appName('rec')
    .getOrCreate()
)

# Relative paths of the three BX CSV files: ratings, books and users.
books_rating_path = "book_recommendation/book_recommendation/BX-Book-Ratings.csv"
books_path = "book_recommendation/book_recommendation/BX-Books.csv"
books_user_path = "book_recommendation/book_recommendation/BX-Users.csv"
@kshirsagarsiddharth
kshirsagarsiddharth / find_quotient.py
Created October 16, 2020 12:34
find quotient of two numbers
def find_quotient(x,y):
# keep the value of power as large as possible
result,power = 0,16
# this is (2^k)y initially defined
# in this case it is (2^16)y
y_power = y << power
# because we are successfully dividing we stop when
# x is less than y
while x >= y:
print("The Crawler is started")

# Both answers are kept exactly as typed; input() returns strings, so
# number_of_threads is still text at this point.
base_url = input("Please Enter Website to Crawl > ")
number_of_threads = input("Please Enter number of Threads > ")

# Crawl bookkeeping: the set of visited entries, the list of crawler threads,
# and a lock (presumably guarding URL bookkeeping; confirm against the worker code).
have_visited = set()
crawler_threads = []
url_lock = threading.Lock()

# Seed the work queue with the starting URL.
links_to_crawl = queue.Queue()
links_to_crawl.put(base_url)
def run(self):
# we create an ssl context so that our script can crawl
# the https sites with ssl_handshake.
#Create a SSLContext object with default settings.
my_ssl = ssl.create_default_context()
# by default when creating a default ssl context and making a handshake
# we verify the hostname with the certificate but our objective is to crawl
# the webpage so we will not be checking the validity of the certificate.